koichi12 commited on
Commit
0efc066
·
verified ·
1 Parent(s): bcc798f

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. .venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc +3 -0
  3. .venv/lib/python3.11/site-packages/nvidia/__init__.py +0 -0
  4. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__init__.py +0 -0
  5. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h +78 -0
  6. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h +282 -0
  7. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_defines.h +65 -0
  8. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/mma.h +60 -0
  9. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp +92 -0
  10. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h +106 -0
  11. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_types.h +180 -0
  12. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp +316 -0
  13. .venv/lib/python3.11/site-packages/nvidia/curand/__init__.py +0 -0
  14. .venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/nvidia/curand/include/__init__.py +0 -0
  16. .venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand.h +1077 -0
  18. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h +87 -0
  19. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h +253 -0
  20. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_globals.h +93 -0
  21. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_kernel.h +1677 -0
  22. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_lognormal.h +697 -0
  23. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mrg32k3a.h +0 -0
  24. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h +210 -0
  25. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_host.h +516 -0
  26. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h +386 -0
  27. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32dc_p_11213.h +0 -0
  28. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h +840 -0
  29. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal_static.h +134 -0
  30. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_philox4x32_x.h +195 -0
  31. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_poisson.h +763 -0
  32. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_precalc.h +0 -0
  33. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_uniform.h +498 -0
  34. .venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py +0 -0
  35. .venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py +0 -0
  37. .venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py +0 -0
  39. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/__pycache__/__init__.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExt.h +1561 -0
  41. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCuda.h +164 -0
  42. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCudaRt.h +140 -0
  43. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtOpenCL.h +214 -0
  44. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtSync.h +406 -0
  45. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExt.h +1499 -0
  46. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtCuda.h +170 -0
  47. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtCudaRt.h +146 -0
  48. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtOpenCL.h +220 -0
  49. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtSync.h +411 -0
  50. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h +469 -0
.gitattributes CHANGED
@@ -116,3 +116,6 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
116
  .venv/lib/python3.11/site-packages/torchvision.libs/libz.5f199d92.so.1 filter=lfs diff=lfs merge=lfs -text
117
  .venv/lib/python3.11/site-packages/torchvision.libs/libjpeg.ceea7512.so.62 filter=lfs diff=lfs merge=lfs -text
118
  .venv/lib/python3.11/site-packages/attr/__pycache__/_make.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
 
 
116
  .venv/lib/python3.11/site-packages/torchvision.libs/libz.5f199d92.so.1 filter=lfs diff=lfs merge=lfs -text
117
  .venv/lib/python3.11/site-packages/torchvision.libs/libjpeg.ceea7512.so.62 filter=lfs diff=lfs merge=lfs -text
118
  .venv/lib/python3.11/site-packages/attr/__pycache__/_make.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
119
+ .venv/lib/python3.11/site-packages/opencv_python_headless.libs/libgfortran-91cc3cb1.so.3.0.0 filter=lfs diff=lfs merge=lfs -text
120
+ .venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
121
+ .venv/lib/python3.11/site-packages/pyasn1/type/__pycache__/univ.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:484629aab5363f8454a60a903003e4f5a00aa9ce88c11751116cfec8fcae7c8b
3
+ size 142553
.venv/lib/python3.11/site-packages/nvidia/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAPROFILERTYPEDEFS_H
51
+ #define CUDAPROFILERTYPEDEFS_H
52
+
53
+ #include <cudaProfiler.h>
54
+
55
+ #ifdef __cplusplus
56
+ extern "C" {
57
+ #endif // __cplusplus
58
+
59
+ /*
60
+ * Macros for the latest version for each driver function in cudaProfiler.h
61
+ */
62
+ #define PFN_cuProfilerInitialize PFN_cuProfilerInitialize_v4000
63
+ #define PFN_cuProfilerStart PFN_cuProfilerStart_v4000
64
+ #define PFN_cuProfilerStop PFN_cuProfilerStop_v4000
65
+
66
+
67
+ /**
68
+ * Type definitions for functions defined in cudaProfiler.h
69
+ */
70
+ typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
71
+ typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void);
72
+ typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void);
73
+
74
+ #ifdef __cplusplus
75
+ }
76
+ #endif // __cplusplus
77
+
78
+ #endif // file guard
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAVDPAU_H
51
+ #define CUDAVDPAU_H
52
+
53
+ #ifdef CUDA_FORCE_API_VERSION
54
+ #error "CUDA_FORCE_API_VERSION is no longer supported."
55
+ #endif
56
+
57
+ #define cuVDPAUCtxCreate cuVDPAUCtxCreate_v2
58
+
59
+ #ifdef __cplusplus
60
+ extern "C" {
61
+ #endif
62
+
63
+ /**
64
+ * \defgroup CUDA_VDPAU VDPAU Interoperability
65
+ * \ingroup CUDA_DRIVER
66
+ *
67
+ * ___MANBRIEF___ VDPAU interoperability functions of the low-level CUDA driver
68
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
69
+ *
70
+ * This section describes the VDPAU interoperability functions of the
71
+ * low-level CUDA driver application programming interface.
72
+ *
73
+ * @{
74
+ */
75
+
76
+ /**
77
+ * \brief Gets the CUDA device associated with a VDPAU device
78
+ *
79
+ * Returns in \p *pDevice the CUDA device associated with a \p vdpDevice, if
80
+ * applicable.
81
+ *
82
+ * \param pDevice - Device associated with vdpDevice
83
+ * \param vdpDevice - A VdpDevice handle
84
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
85
+ *
86
+ * \return
87
+ * ::CUDA_SUCCESS,
88
+ * ::CUDA_ERROR_DEINITIALIZED,
89
+ * ::CUDA_ERROR_NOT_INITIALIZED,
90
+ * ::CUDA_ERROR_INVALID_CONTEXT,
91
+ * ::CUDA_ERROR_INVALID_VALUE
92
+ * \notefnerr
93
+ *
94
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
95
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
96
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
97
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
98
+ * ::cudaVDPAUGetDevice
99
+ */
100
+ CUresult CUDAAPI cuVDPAUGetDevice(CUdevice *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
101
+
102
+ /**
103
+ * \brief Create a CUDA context for interoperability with VDPAU
104
+ *
105
+ * Creates a new CUDA context, initializes VDPAU interoperability, and
106
+ * associates the CUDA context with the calling thread. It must be called
107
+ * before performing any other VDPAU interoperability operations. It may fail
108
+ * if the needed VDPAU driver facilities are not available. For usage of the
109
+ * \p flags parameter, see ::cuCtxCreate().
110
+ *
111
+ * \param pCtx - Returned CUDA context
112
+ * \param flags - Options for CUDA context creation
113
+ * \param device - Device on which to create the context
114
+ * \param vdpDevice - The VdpDevice to interop with
115
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
116
+ *
117
+ * \return
118
+ * ::CUDA_SUCCESS,
119
+ * ::CUDA_ERROR_DEINITIALIZED,
120
+ * ::CUDA_ERROR_NOT_INITIALIZED,
121
+ * ::CUDA_ERROR_INVALID_CONTEXT,
122
+ * ::CUDA_ERROR_INVALID_VALUE,
123
+ * ::CUDA_ERROR_OUT_OF_MEMORY
124
+ * \notefnerr
125
+ *
126
+ * \sa ::cuCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
127
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
128
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
129
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
130
+ * ::cuVDPAUGetDevice
131
+ */
132
+ CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
133
+
134
+ /**
135
+ * \brief Registers a VDPAU VdpVideoSurface object
136
+ *
137
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by
138
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
139
+ * The surface's intended usage is specified using \p flags, as follows:
140
+ *
141
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
142
+ * resource will be used. It is therefore assumed that this resource will be
143
+ * read from and written to by CUDA. This is the default value.
144
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
145
+ * will not write to this resource.
146
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
147
+ * CUDA will not read from this resource and will write over the
148
+ * entire contents of the resource, so none of the data previously
149
+ * stored in the resource will be preserved.
150
+ *
151
+ * The VdpVideoSurface is presented as an array of subresources that may be
152
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
153
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
154
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
155
+ *
156
+ * \htmlonly
157
+ * <table>
158
+ * <tr><th>VdpChromaType </th><th>arrayIndex</th><th>Size </th><th>Format</th><th>Content </th></tr>
159
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_420</td><td>0 </td><td>w x h/2</td><td>R8 </td><td>Top-field luma </td></tr>
160
+ * <tr> <td>1 </td><td>w x h/2</td><td>R8 </td><td>Bottom-field luma </td></tr>
161
+ * <tr> <td>2 </td><td>w/2 x h/4</td><td>R8G8 </td><td>Top-field chroma </td></tr>
162
+ * <tr> <td>3 </td><td>w/2 x h/4</td><td>R8G8 </td><td>Bottom-field chroma</td></tr>
163
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_422</td><td>0 </td><td>w x h/2</td><td>R8 </td><td>Top-field luma </td></tr>
164
+ * <tr> <td>1 </td><td>w x h/2</td><td>R8 </td><td>Bottom-field luma </td></tr>
165
+ * <tr> <td>2 </td><td>w/2 x h/2</td><td>R8G8 </td><td>Top-field chroma </td></tr>
166
+ * <tr> <td>3 </td><td>w/2 x h/2</td><td>R8G8 </td><td>Bottom-field chroma</td></tr>
167
+ * </table>
168
+ * \endhtmlonly
169
+ *
170
+ * \latexonly
171
+ * \begin{tabular}{|l|l|l|l|l|}
172
+ * \hline
173
+ * VdpChromaType & arrayIndex & Size & Format & Content \\
174
+ * \hline
175
+ * VDP\_CHROMA\_TYPE\_420 & 0 & w x h/2 & R8 & Top-field luma \\
176
+ * & 1 & w x h/2 & R8 & Bottom-field luma \\
177
+ * & 2 & w/2 x h/4 & R8G8 & Top-field chroma \\
178
+ * & 3 & w/2 x h/4 & R8G8 & Bottom-field chroma \\
179
+ * \hline
180
+ * VDP\_CHROMA\_TYPE\_422 & 0 & w x h/2 & R8 & Top-field luma \\
181
+ * & 1 & w x h/2 & R8 & Bottom-field luma \\
182
+ * & 2 & w/2 x h/2 & R8G8 & Top-field chroma \\
183
+ * & 3 & w/2 x h/2 & R8G8 & Bottom-field chroma \\
184
+ * \hline
185
+ * \end{tabular}
186
+ * \endlatexonly
187
+ *
188
+ * \param pCudaResource - Pointer to the returned object handle
189
+ * \param vdpSurface - The VdpVideoSurface to be registered
190
+ * \param flags - Map flags
191
+ *
192
+ * \return
193
+ * ::CUDA_SUCCESS,
194
+ * ::CUDA_ERROR_INVALID_HANDLE,
195
+ * ::CUDA_ERROR_ALREADY_MAPPED,
196
+ * ::CUDA_ERROR_INVALID_CONTEXT,
197
+ * \notefnerr
198
+ *
199
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
200
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
201
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
202
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
203
+ * ::cuVDPAUGetDevice,
204
+ * ::cudaGraphicsVDPAURegisterVideoSurface
205
+ */
206
+ CUresult CUDAAPI cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
207
+
208
+ /**
209
+ * \brief Registers a VDPAU VdpOutputSurface object
210
+ *
211
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by
212
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
213
+ * The surface's intended usage is specified using \p flags, as follows:
214
+ *
215
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
216
+ * resource will be used. It is therefore assumed that this resource will be
217
+ * read from and written to by CUDA. This is the default value.
218
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
219
+ * will not write to this resource.
220
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
221
+ * CUDA will not read from this resource and will write over the
222
+ * entire contents of the resource, so none of the data previously
223
+ * stored in the resource will be preserved.
224
+ *
225
+ * The VdpOutputSurface is presented as an array of subresources that may be
226
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
227
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
228
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
229
+ *
230
+ * \htmlonly
231
+ * <table>
232
+ * <tr><th>VdpRGBAFormat </th><th>arrayIndex</th><th>Size </th><th>Format </th><th>Content </th></tr>
233
+ * <tr><td>VDP_RGBA_FORMAT_B8G8R8A8 </td><td>0 </td><td>w x h</td><td>ARGB8 </td><td>Entire surface</td></tr>
234
+ * <tr><td>VDP_RGBA_FORMAT_R10G10B10A2</td><td>0 </td><td>w x h</td><td>A2BGR10</td><td>Entire surface</td></tr>
235
+ * </table>
236
+ * \endhtmlonly
237
+ *
238
+ * \latexonly
239
+ * \begin{tabular}{|l|l|l|l|l|}
240
+ * \hline
241
+ * VdpRGBAFormat & arrayIndex & Size & Format & Content \\
242
+ * \hline
243
+ * VDP\_RGBA\_FORMAT\_B8G8R8A8 & 0 & w x h & ARGB8 & Entire surface \\
244
+ * VDP\_RGBA\_FORMAT\_R10G10B10A2 & 0 & w x h & A2BGR10 & Entire surface \\
245
+ * \hline
246
+ * \end{tabular}
247
+ * \endlatexonly
248
+ *
249
+ * \param pCudaResource - Pointer to the returned object handle
250
+ * \param vdpSurface - The VdpOutputSurface to be registered
251
+ * \param flags - Map flags
252
+ *
253
+ * \return
254
+ * ::CUDA_SUCCESS,
255
+ * ::CUDA_ERROR_INVALID_HANDLE,
256
+ * ::CUDA_ERROR_ALREADY_MAPPED,
257
+ * ::CUDA_ERROR_INVALID_CONTEXT,
258
+ * \notefnerr
259
+ *
260
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
261
+ * ::cuGraphicsVDPAURegisterVideoSurface, ::cuGraphicsUnregisterResource,
262
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
263
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
264
+ * ::cuVDPAUGetDevice,
265
+ * ::cudaGraphicsVDPAURegisterOutputSurface
266
+ */
267
+ CUresult CUDAAPI cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
268
+
269
+ /** @} */ /* END CUDA_VDPAU */
270
+
271
+
272
+ #if defined(__CUDA_API_VERSION_INTERNAL)
273
+ #undef cuVDPAUCtxCreate
274
+
275
+ CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
276
+ #endif /* __CUDA_API_VERSION_INTERNAL */
277
+
278
+ #ifdef __cplusplus
279
+ };
280
+ #endif
281
+
282
+ #endif
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_defines.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/host_defines.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
65
+ #endif
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/mma.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
52
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
53
+ #endif
54
+
55
+ #include "crt/mma.h"
56
+
57
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__)
58
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
59
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
60
+ #endif
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_20_ATOMIC_FUNCTIONS_HPP__)
51
+ #define __SM_20_ATOMIC_FUNCTIONS_HPP__
52
+
53
+ #if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
54
+ extern "C"
55
+ {
56
+ extern __device__ __device_builtin__ float __fAtomicAdd(float *address, float val);
57
+ }
58
+ #endif /* __CUDA_ARCH__ */
59
+
60
+ #if defined(__CUDACC_RTC__)
61
+ #define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
62
+ #else /* __CUDACC_RTC__ */
63
+ #define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
64
+ #endif /* __CUDACC_RTC__ */
65
+
66
+ #if defined(__cplusplus) && defined(__CUDACC__)
67
+
68
+ /*******************************************************************************
69
+ * *
70
+ * *
71
+ * *
72
+ *******************************************************************************/
73
+
74
+ #include "cuda_runtime_api.h"
75
+
76
+ /*******************************************************************************
77
+ * *
78
+ * *
79
+ * *
80
+ *******************************************************************************/
81
+
82
+ __SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val)
83
+ {
84
+ return __fAtomicAdd(address, val);
85
+ }
86
+
87
+ #endif /* __cplusplus && __CUDACC__ */
88
+
89
+ #undef __SM_20_ATOMIC_FUNCTIONS_DECL__
90
+
91
+ #endif /* !__SM_20_ATOMIC_FUNCTIONS_HPP__ */
92
+
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
4
+
5
+ *
6
+
7
+ * NOTICE TO LICENSEE:
8
+
9
+ *
10
+
11
+ * This source code and/or documentation ("Licensed Deliverables") are
12
+
13
+ * subject to NVIDIA intellectual property rights under U.S. and
14
+
15
+ * international Copyright laws.
16
+
17
+ *
18
+
19
+ * These Licensed Deliverables contained herein is PROPRIETARY and
20
+
21
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
22
+
23
+ * conditions of a form of NVIDIA software license agreement by and
24
+
25
+ * between NVIDIA and Licensee ("License Agreement") or electronically
26
+
27
+ * accepted by Licensee. Notwithstanding any terms or conditions to
28
+
29
+ * the contrary in the License Agreement, reproduction or disclosure
30
+
31
+ * of the Licensed Deliverables to any third party without the express
32
+
33
+ * written consent of NVIDIA is prohibited.
34
+
35
+ *
36
+
37
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
38
+
39
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
40
+
41
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
42
+
43
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
44
+
45
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
46
+
47
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
48
+
49
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
50
+
51
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
52
+
53
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
54
+
55
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
56
+
57
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
58
+
59
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
60
+
61
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
62
+
63
+ * OF THESE LICENSED DELIVERABLES.
64
+
65
+ *
66
+
67
+ * U.S. Government End Users. These Licensed Deliverables are a
68
+
69
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
70
+
71
+ * 1995), consisting of "commercial computer software" and "commercial
72
+
73
+ * computer software documentation" as such terms are used in 48
74
+
75
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
76
+
77
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
78
+
79
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
80
+
81
+ * U.S. Government End Users acquire the Licensed Deliverables with
82
+
83
+ * only those rights set forth herein.
84
+
85
+ *
86
+
87
+ * Any use of the Licensed Deliverables in individual and commercial
88
+
89
+ * software must include, in the user documentation and internal
90
+
91
+ * comments to the code, the above Disclaimer and U.S. Government End
92
+
93
+ * Users Notice.
94
+
95
+ */
96
+
97
+
98
+
99
+ #if !defined(__SM_35_INTRINSICS_H__)
100
+ #define __SM_35_INTRINSICS_H__
101
+
102
+
103
+
104
+
105
+ #endif /* !__SM_35_INTRINSICS_H__ */
106
+
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_types.h ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__TEXTURE_TYPES_H__)
51
+ #define __TEXTURE_TYPES_H__
52
+
53
+ /*******************************************************************************
54
+ * *
55
+ * *
56
+ * *
57
+ *******************************************************************************/
58
+
59
+ #include "driver_types.h"
60
+
61
+ #ifndef __CUDACC_RTC_MINIMAL__
62
+
63
+ /**
64
+ * \addtogroup CUDART_TYPES
65
+ *
66
+ * @{
67
+ */
68
+
69
+ /*******************************************************************************
70
+ * *
71
+ * *
72
+ * *
73
+ *******************************************************************************/
74
+
75
+ #define cudaTextureType1D 0x01
76
+ #define cudaTextureType2D 0x02
77
+ #define cudaTextureType3D 0x03
78
+ #define cudaTextureTypeCubemap 0x0C
79
+ #define cudaTextureType1DLayered 0xF1
80
+ #define cudaTextureType2DLayered 0xF2
81
+ #define cudaTextureTypeCubemapLayered 0xFC
82
+
83
+ /**
84
+ * CUDA texture address modes
85
+ */
86
+ enum __device_builtin__ cudaTextureAddressMode
87
+ {
88
+ cudaAddressModeWrap = 0, /**< Wrapping address mode */
89
+ cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
90
+ cudaAddressModeMirror = 2, /**< Mirror address mode */
91
+ cudaAddressModeBorder = 3 /**< Border address mode */
92
+ };
93
+
94
+ /**
95
+ * CUDA texture filter modes
96
+ */
97
+ enum __device_builtin__ cudaTextureFilterMode
98
+ {
99
+ cudaFilterModePoint = 0, /**< Point filter mode */
100
+ cudaFilterModeLinear = 1 /**< Linear filter mode */
101
+ };
102
+
103
+ /**
104
+ * CUDA texture read modes
105
+ */
106
+ enum __device_builtin__ cudaTextureReadMode
107
+ {
108
+ cudaReadModeElementType = 0, /**< Read texture as specified element type */
109
+ cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
110
+ };
111
+
112
+ /**
113
+ * CUDA texture descriptor
114
+ */
115
+ struct __device_builtin__ cudaTextureDesc
116
+ {
117
+ /**
118
+ * Texture address mode for up to 3 dimensions
119
+ */
120
+ enum cudaTextureAddressMode addressMode[3];
121
+ /**
122
+ * Texture filter mode
123
+ */
124
+ enum cudaTextureFilterMode filterMode;
125
+ /**
126
+ * Texture read mode
127
+ */
128
+ enum cudaTextureReadMode readMode;
129
+ /**
130
+ * Perform sRGB->linear conversion during texture read
131
+ */
132
+ int sRGB;
133
+ /**
134
+ * Texture Border Color
135
+ */
136
+ float borderColor[4];
137
+ /**
138
+ * Indicates whether texture reads are normalized or not
139
+ */
140
+ int normalizedCoords;
141
+ /**
142
+ * Limit to the anisotropy ratio
143
+ */
144
+ unsigned int maxAnisotropy;
145
+ /**
146
+ * Mipmap filter mode
147
+ */
148
+ enum cudaTextureFilterMode mipmapFilterMode;
149
+ /**
150
+ * Offset applied to the supplied mipmap level
151
+ */
152
+ float mipmapLevelBias;
153
+ /**
154
+ * Lower end of the mipmap level range to clamp access to
155
+ */
156
+ float minMipmapLevelClamp;
157
+ /**
158
+ * Upper end of the mipmap level range to clamp access to
159
+ */
160
+ float maxMipmapLevelClamp;
161
+ /**
162
+ * Disable any trilinear filtering optimizations.
163
+ */
164
+ int disableTrilinearOptimization;
165
+ /**
166
+ * Enable seamless cube map filtering.
167
+ */
168
+ int seamlessCubemap;
169
+ };
170
+
171
+ /**
172
+ * An opaque value that represents a CUDA texture object
173
+ */
174
+ typedef __device_builtin__ unsigned long long cudaTextureObject_t;
175
+
176
+ /** @} */
177
+ /** @} */ /* END CUDART_TYPES */
178
+
179
+ #endif /* !__CUDACC_RTC_MINIMAL__ */
180
+ #endif /* !__TEXTURE_TYPES_H__ */
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__VECTOR_FUNCTIONS_HPP__)
51
+ #define __VECTOR_FUNCTIONS_HPP__
52
+
53
+ /*******************************************************************************
54
+ * *
55
+ * *
56
+ * *
57
+ *******************************************************************************/
58
+
59
+ #include "cuda_runtime_api.h"
60
+
61
+ #if defined(__CUDACC_RTC__)
62
+ #define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
63
+ #else /* !__CUDACC_RTC__ */
64
+ #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
65
+ #endif /* __CUDACC_RTC__ */
66
+
67
+ /*******************************************************************************
68
+ * *
69
+ * *
70
+ * *
71
+ *******************************************************************************/
72
+
73
+ __VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
74
+ {
75
+ char1 t; t.x = x; return t;
76
+ }
77
+
78
+ __VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
79
+ {
80
+ uchar1 t; t.x = x; return t;
81
+ }
82
+
83
+ __VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
84
+ {
85
+ char2 t; t.x = x; t.y = y; return t;
86
+ }
87
+
88
+ __VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
89
+ {
90
+ uchar2 t; t.x = x; t.y = y; return t;
91
+ }
92
+
93
+ __VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
94
+ {
95
+ char3 t; t.x = x; t.y = y; t.z = z; return t;
96
+ }
97
+
98
+ __VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
99
+ {
100
+ uchar3 t; t.x = x; t.y = y; t.z = z; return t;
101
+ }
102
+
103
+ __VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
104
+ {
105
+ char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
106
+ }
107
+
108
+ __VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
109
+ {
110
+ uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
111
+ }
112
+
113
+ __VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
114
+ {
115
+ short1 t; t.x = x; return t;
116
+ }
117
+
118
+ __VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
119
+ {
120
+ ushort1 t; t.x = x; return t;
121
+ }
122
+
123
+ __VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
124
+ {
125
+ short2 t; t.x = x; t.y = y; return t;
126
+ }
127
+
128
+ __VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
129
+ {
130
+ ushort2 t; t.x = x; t.y = y; return t;
131
+ }
132
+
133
+ __VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
134
+ {
135
+ short3 t; t.x = x; t.y = y; t.z = z; return t;
136
+ }
137
+
138
+ __VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
139
+ {
140
+ ushort3 t; t.x = x; t.y = y; t.z = z; return t;
141
+ }
142
+
143
+ __VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
144
+ {
145
+ short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
146
+ }
147
+
148
+ __VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
149
+ {
150
+ ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
151
+ }
152
+
153
+ __VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
154
+ {
155
+ int1 t; t.x = x; return t;
156
+ }
157
+
158
+ __VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
159
+ {
160
+ uint1 t; t.x = x; return t;
161
+ }
162
+
163
+ __VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
164
+ {
165
+ int2 t; t.x = x; t.y = y; return t;
166
+ }
167
+
168
+ __VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
169
+ {
170
+ uint2 t; t.x = x; t.y = y; return t;
171
+ }
172
+
173
+ __VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
174
+ {
175
+ int3 t; t.x = x; t.y = y; t.z = z; return t;
176
+ }
177
+
178
+ __VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
179
+ {
180
+ uint3 t; t.x = x; t.y = y; t.z = z; return t;
181
+ }
182
+
183
+ __VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
184
+ {
185
+ int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
186
+ }
187
+
188
+ __VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
189
+ {
190
+ uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
191
+ }
192
+
193
+ __VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
194
+ {
195
+ long1 t; t.x = x; return t;
196
+ }
197
+
198
+ __VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
199
+ {
200
+ ulong1 t; t.x = x; return t;
201
+ }
202
+
203
+ __VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
204
+ {
205
+ long2 t; t.x = x; t.y = y; return t;
206
+ }
207
+
208
+ __VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
209
+ {
210
+ ulong2 t; t.x = x; t.y = y; return t;
211
+ }
212
+
213
+ __VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
214
+ {
215
+ long3 t; t.x = x; t.y = y; t.z = z; return t;
216
+ }
217
+
218
+ __VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
219
+ {
220
+ ulong3 t; t.x = x; t.y = y; t.z = z; return t;
221
+ }
222
+
223
+ __VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
224
+ {
225
+ long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
226
+ }
227
+
228
+ __VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
229
+ {
230
+ ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
231
+ }
232
+
233
+ __VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
234
+ {
235
+ float1 t; t.x = x; return t;
236
+ }
237
+
238
+ __VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
239
+ {
240
+ float2 t; t.x = x; t.y = y; return t;
241
+ }
242
+
243
+ __VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
244
+ {
245
+ float3 t; t.x = x; t.y = y; t.z = z; return t;
246
+ }
247
+
248
+ __VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
249
+ {
250
+ float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
251
+ }
252
+
253
+ __VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
254
+ {
255
+ longlong1 t; t.x = x; return t;
256
+ }
257
+
258
+ __VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
259
+ {
260
+ ulonglong1 t; t.x = x; return t;
261
+ }
262
+
263
+ __VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
264
+ {
265
+ longlong2 t; t.x = x; t.y = y; return t;
266
+ }
267
+
268
+ __VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
269
+ {
270
+ ulonglong2 t; t.x = x; t.y = y; return t;
271
+ }
272
+
273
+ __VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
274
+ {
275
+ longlong3 t; t.x = x; t.y = y; t.z = z; return t;
276
+ }
277
+
278
+ __VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
279
+ {
280
+ ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
281
+ }
282
+
283
+ __VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
284
+ {
285
+ longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
286
+ }
287
+
288
+ __VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
289
+ {
290
+ ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
291
+ }
292
+
293
+ __VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
294
+ {
295
+ double1 t; t.x = x; return t;
296
+ }
297
+
298
+ __VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
299
+ {
300
+ double2 t; t.x = x; t.y = y; return t;
301
+ }
302
+
303
+ __VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
304
+ {
305
+ double3 t; t.x = x; t.y = y; t.z = z; return t;
306
+ }
307
+
308
+ __VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
309
+ {
310
+ double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
311
+ }
312
+
313
+ #undef __VECTOR_FUNCTIONS_DECL__
314
+
315
+ #endif /* !__VECTOR_FUNCTIONS_HPP__ */
316
+
.venv/lib/python3.11/site-packages/nvidia/curand/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (186 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/curand/include/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (194 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand.h ADDED
@@ -0,0 +1,1077 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(CURAND_H_)
51
+ #define CURAND_H_
52
+
53
+ /**
54
+ * \defgroup HOST Host API
55
+ *
56
+ * @{
57
+ */
58
+ #ifndef __CUDACC_RTC__
59
+ #include <cuda_runtime.h>
60
+ #endif
61
+
62
+ #ifndef CURANDAPI
63
+ #ifdef _WIN32
64
+ #define CURANDAPI __stdcall
65
+ #else
66
+ #define CURANDAPI
67
+ #endif
68
+ #endif
69
+
70
+ #if defined(__cplusplus)
71
+ extern "C" {
72
+ #endif /* __cplusplus */
73
+
74
+ #define CURAND_VER_MAJOR 10
75
+ #define CURAND_VER_MINOR 3
76
+ #define CURAND_VER_PATCH 5
77
+ #define CURAND_VER_BUILD 147
78
+ #define CURAND_VERSION (CURAND_VER_MAJOR * 1000 + \
79
+ CURAND_VER_MINOR * 100 + \
80
+ CURAND_VER_PATCH)
81
+ /* CURAND Host API datatypes */
82
+
83
+ /**
84
+ * @{
85
+ */
86
+
87
+ /**
88
+ * CURAND function call status types
89
+ */
90
+ enum curandStatus {
91
+ CURAND_STATUS_SUCCESS = 0, ///< No errors
92
+ CURAND_STATUS_VERSION_MISMATCH = 100, ///< Header file and linked library version do not match
93
+ CURAND_STATUS_NOT_INITIALIZED = 101, ///< Generator not initialized
94
+ CURAND_STATUS_ALLOCATION_FAILED = 102, ///< Memory allocation failed
95
+ CURAND_STATUS_TYPE_ERROR = 103, ///< Generator is wrong type
96
+ CURAND_STATUS_OUT_OF_RANGE = 104, ///< Argument out of range
97
+ CURAND_STATUS_LENGTH_NOT_MULTIPLE = 105, ///< Length requested is not a multiple of dimension
98
+ CURAND_STATUS_DOUBLE_PRECISION_REQUIRED = 106, ///< GPU does not have double precision required by MRG32k3a
99
+ CURAND_STATUS_LAUNCH_FAILURE = 201, ///< Kernel launch failure
100
+ CURAND_STATUS_PREEXISTING_FAILURE = 202, ///< Preexisting failure on library entry
101
+ CURAND_STATUS_INITIALIZATION_FAILED = 203, ///< Initialization of CUDA failed
102
+ CURAND_STATUS_ARCH_MISMATCH = 204, ///< Architecture mismatch, GPU does not support requested feature
103
+ CURAND_STATUS_INTERNAL_ERROR = 999 ///< Internal library error
104
+ };
105
+
106
+ /*
107
+ * CURAND function call status types
108
+ */
109
+ /** \cond UNHIDE_TYPEDEFS */
110
+ typedef enum curandStatus curandStatus_t;
111
+ /** \endcond */
112
+
113
+ /**
114
+ * CURAND generator types
115
+ */
116
+ enum curandRngType {
117
+ CURAND_RNG_TEST = 0,
118
+ CURAND_RNG_PSEUDO_DEFAULT = 100, ///< Default pseudorandom generator
119
+ CURAND_RNG_PSEUDO_XORWOW = 101, ///< XORWOW pseudorandom generator
120
+ CURAND_RNG_PSEUDO_MRG32K3A = 121, ///< MRG32k3a pseudorandom generator
121
+ CURAND_RNG_PSEUDO_MTGP32 = 141, ///< Mersenne Twister MTGP32 pseudorandom generator
122
+ CURAND_RNG_PSEUDO_MT19937 = 142, ///< Mersenne Twister MT19937 pseudorandom generator
123
+ CURAND_RNG_PSEUDO_PHILOX4_32_10 = 161, ///< PHILOX-4x32-10 pseudorandom generator
124
+ CURAND_RNG_QUASI_DEFAULT = 200, ///< Default quasirandom generator
125
+ CURAND_RNG_QUASI_SOBOL32 = 201, ///< Sobol32 quasirandom generator
126
+ CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202, ///< Scrambled Sobol32 quasirandom generator
127
+ CURAND_RNG_QUASI_SOBOL64 = 203, ///< Sobol64 quasirandom generator
128
+ CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204 ///< Scrambled Sobol64 quasirandom generator
129
+ };
130
+
131
+ /*
132
+ * CURAND generator types
133
+ */
134
+ /** \cond UNHIDE_TYPEDEFS */
135
+ typedef enum curandRngType curandRngType_t;
136
+ /** \endcond */
137
+
138
+ /**
139
+ * CURAND ordering of results in memory
140
+ */
141
+ enum curandOrdering {
142
+ CURAND_ORDERING_PSEUDO_BEST = 100, ///< Best ordering for pseudorandom results
143
+ CURAND_ORDERING_PSEUDO_DEFAULT = 101, ///< Specific default thread sequence for pseudorandom results, same as CURAND_ORDERING_PSEUDO_BEST
144
+ CURAND_ORDERING_PSEUDO_SEEDED = 102, ///< Specific seeding pattern for fast lower quality pseudorandom results
145
+ CURAND_ORDERING_PSEUDO_LEGACY = 103, ///< Specific legacy sequence for pseudorandom results, guaranteed to remain the same for all cuRAND releases
146
+ CURAND_ORDERING_PSEUDO_DYNAMIC = 104, ///< Specific ordering adjusted to the device it is being executed on, provides the best performance
147
+ CURAND_ORDERING_QUASI_DEFAULT = 201 ///< Specific n-dimensional ordering for quasirandom results
148
+ };
149
+
150
+ /*
151
+ * CURAND ordering of results in memory
152
+ */
153
+ /** \cond UNHIDE_TYPEDEFS */
154
+ typedef enum curandOrdering curandOrdering_t;
155
+ /** \endcond */
156
+
157
+ /**
158
+ * CURAND choice of direction vector set
159
+ */
160
+ enum curandDirectionVectorSet {
161
+ CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101, ///< Specific set of 32-bit direction vectors generated from polynomials recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions
162
+ CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 102, ///< Specific set of 32-bit direction vectors generated from polynomials recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions, and scrambled
163
+ CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103, ///< Specific set of 64-bit direction vectors generated from polynomials recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions
164
+ CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 104 ///< Specific set of 64-bit direction vectors generated from polynomials recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions, and scrambled
165
+ };
166
+
167
+ /*
168
+ * CURAND choice of direction vector set
169
+ */
170
+ /** \cond UNHIDE_TYPEDEFS */
171
+ typedef enum curandDirectionVectorSet curandDirectionVectorSet_t;
172
+ /** \endcond */
173
+
174
+ /**
175
+ * CURAND array of 32-bit direction vectors
176
+ */
177
+ /** \cond UNHIDE_TYPEDEFS */
178
+ typedef unsigned int curandDirectionVectors32_t[32];
179
+ /** \endcond */
180
+
181
+ /**
182
+ * CURAND array of 64-bit direction vectors
183
+ */
184
+ /** \cond UNHIDE_TYPEDEFS */
185
+ typedef unsigned long long curandDirectionVectors64_t[64];
186
+ /** \endcond **/
187
+
188
+ /**
189
+ * CURAND generator (opaque)
190
+ */
191
+ struct curandGenerator_st;
192
+
193
+ /**
194
+ * CURAND generator
195
+ */
196
+ /** \cond UNHIDE_TYPEDEFS */
197
+ typedef struct curandGenerator_st *curandGenerator_t;
198
+ /** \endcond */
199
+
200
+ /**
201
+ * CURAND distribution
202
+ */
203
+ /** \cond UNHIDE_TYPEDEFS */
204
+ typedef double curandDistribution_st;
205
+ typedef curandDistribution_st *curandDistribution_t;
206
+ typedef struct curandDistributionShift_st *curandDistributionShift_t;
207
+ /** \endcond */
208
+ /**
209
+ * CURAND distribution M2
210
+ */
211
+ /** \cond UNHIDE_TYPEDEFS */
212
+ typedef struct curandDistributionM2Shift_st *curandDistributionM2Shift_t;
213
+ typedef struct curandHistogramM2_st *curandHistogramM2_t;
214
+ typedef unsigned int curandHistogramM2K_st;
215
+ typedef curandHistogramM2K_st *curandHistogramM2K_t;
216
+ typedef curandDistribution_st curandHistogramM2V_st;
217
+ typedef curandHistogramM2V_st *curandHistogramM2V_t;
218
+
219
+ typedef struct curandDiscreteDistribution_st *curandDiscreteDistribution_t;
220
+ /** \endcond */
221
+
222
+ /*
223
+ * CURAND METHOD
224
+ */
225
+ /** \cond UNHIDE_ENUMS */
226
+ enum curandMethod {
227
+ CURAND_CHOOSE_BEST = 0, // choose best depends on args
228
+ CURAND_ITR = 1,
229
+ CURAND_KNUTH = 2,
230
+ CURAND_HITR = 3,
231
+ CURAND_M1 = 4,
232
+ CURAND_M2 = 5,
233
+ CURAND_BINARY_SEARCH = 6,
234
+ CURAND_DISCRETE_GAUSS = 7,
235
+ CURAND_REJECTION = 8,
236
+ CURAND_DEVICE_API = 9,
237
+ CURAND_FAST_REJECTION = 10,
238
+ CURAND_3RD = 11,
239
+ CURAND_DEFINITION = 12,
240
+ CURAND_POISSON = 13
241
+ };
242
+
243
+ typedef enum curandMethod curandMethod_t;
244
+ /** \endcond */
245
+
246
+
247
+ #ifndef __CUDACC_RTC__
248
+
249
+ /**
250
+ * @}
251
+ */
252
+
253
+ /**
254
+ * \brief Create new random number generator.
255
+ *
256
+ * Creates a new random number generator of type \p rng_type
257
+ * and returns it in \p *generator.
258
+ *
259
+ * Legal values for \p rng_type are:
260
+ * - CURAND_RNG_PSEUDO_DEFAULT
261
+ * - CURAND_RNG_PSEUDO_XORWOW
262
+ * - CURAND_RNG_PSEUDO_MRG32K3A
263
+ * - CURAND_RNG_PSEUDO_MTGP32
264
+ * - CURAND_RNG_PSEUDO_MT19937
265
+ * - CURAND_RNG_PSEUDO_PHILOX4_32_10
266
+ * - CURAND_RNG_QUASI_DEFAULT
267
+ * - CURAND_RNG_QUASI_SOBOL32
268
+ * - CURAND_RNG_QUASI_SCRAMBLED_SOBOL32
269
+ * - CURAND_RNG_QUASI_SOBOL64
270
+ * - CURAND_RNG_QUASI_SCRAMBLED_SOBOL64
271
+ *
272
+ * When \p rng_type is CURAND_RNG_PSEUDO_DEFAULT, the type chosen
273
+ * is CURAND_RNG_PSEUDO_XORWOW. \n
274
+ * When \p rng_type is CURAND_RNG_QUASI_DEFAULT,
275
+ * the type chosen is CURAND_RNG_QUASI_SOBOL32.
276
+ *
277
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_XORWOW are:
278
+ * - \p seed = 0
279
+ * - \p offset = 0
280
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
281
+ *
282
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MRG32K3A are:
283
+ * - \p seed = 0
284
+ * - \p offset = 0
285
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
286
+ *
287
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MTGP32 are:
288
+ * - \p seed = 0
289
+ * - \p offset = 0
290
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
291
+ *
292
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MT19937 are:
293
+ * - \p seed = 0
294
+ * - \p offset = 0
295
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
296
+ *
297
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_PHILOX4_32_10 are:
298
+ * - \p seed = 0
299
+ * - \p offset = 0
300
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
301
+ *
302
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SOBOL32 are:
303
+ * - \p dimensions = 1
304
+ * - \p offset = 0
305
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
306
+ *
307
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SOBOL64 are:
308
+ * - \p dimensions = 1
309
+ * - \p offset = 0
310
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
311
+ *
312
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 are:
313
+ * - \p dimensions = 1
314
+ * - \p offset = 0
315
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
316
+ *
317
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 are:
318
+ * - \p dimensions = 1
319
+ * - \p offset = 0
320
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
321
+ *
322
+ * \param generator - Pointer to generator
323
+ * \param rng_type - Type of generator to create
324
+ *
325
+ * \return
326
+ * - CURAND_STATUS_ALLOCATION_FAILED, if memory could not be allocated \n
327
+ * - CURAND_STATUS_INITIALIZATION_FAILED if there was a problem setting up the GPU \n
328
+ * - CURAND_STATUS_VERSION_MISMATCH if the header file version does not match the
329
+ * dynamically linked library version \n
330
+ * - CURAND_STATUS_TYPE_ERROR if the value for \p rng_type is invalid \n
331
+ * - CURAND_STATUS_SUCCESS if generator was created successfully \n
332
+ *
333
+ */
334
+ curandStatus_t CURANDAPI
335
+ curandCreateGenerator(curandGenerator_t *generator, curandRngType_t rng_type);
336
+
337
+ /**
338
+ * \brief Create new host CPU random number generator.
339
+ *
340
+ * Creates a new host CPU random number generator of type \p rng_type
341
+ * and returns it in \p *generator.
342
+ *
343
+ * Legal values for \p rng_type are:
344
+ * - CURAND_RNG_PSEUDO_DEFAULT
345
+ * - CURAND_RNG_PSEUDO_XORWOW
346
+ * - CURAND_RNG_PSEUDO_MRG32K3A
347
+ * - CURAND_RNG_PSEUDO_MTGP32
348
+ * - CURAND_RNG_PSEUDO_MT19937
349
+ * - CURAND_RNG_PSEUDO_PHILOX4_32_10
350
+ * - CURAND_RNG_QUASI_DEFAULT
351
+ * - CURAND_RNG_QUASI_SOBOL32
352
+ *
353
+ * When \p rng_type is CURAND_RNG_PSEUDO_DEFAULT, the type chosen
354
+ * is CURAND_RNG_PSEUDO_XORWOW. \n
355
+ * When \p rng_type is CURAND_RNG_QUASI_DEFAULT,
356
+ * the type chosen is CURAND_RNG_QUASI_SOBOL32.
357
+ *
358
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_XORWOW are:
359
+ * - \p seed = 0
360
+ * - \p offset = 0
361
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
362
+ *
363
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MRG32K3A are:
364
+ * - \p seed = 0
365
+ * - \p offset = 0
366
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
367
+ *
368
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MTGP32 are:
369
+ * - \p seed = 0
370
+ * - \p offset = 0
371
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
372
+ *
373
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MT19937 are:
374
+ * - \p seed = 0
375
+ * - \p offset = 0
376
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
377
+ *
378
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_PHILOX4_32_10 are:
379
+ * - \p seed = 0
380
+ * - \p offset = 0
381
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
382
+ *
383
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SOBOL32 are:
384
+ * - \p dimensions = 1
385
+ * - \p offset = 0
386
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
387
+ *
388
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SOBOL64 are:
389
+ * - \p dimensions = 1
390
+ * - \p offset = 0
391
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
392
+ *
393
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 are:
394
+ * - \p dimensions = 1
395
+ * - \p offset = 0
396
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
397
+ *
398
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 are:
399
+ * - \p dimensions = 1
400
+ * - \p offset = 0
401
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
402
+ *
403
+ * \param generator - Pointer to generator
404
+ * \param rng_type - Type of generator to create
405
+ *
406
+ * \return
407
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
408
+ * - CURAND_STATUS_INITIALIZATION_FAILED if there was a problem setting up the GPU \n
409
+ * - CURAND_STATUS_VERSION_MISMATCH if the header file version does not match the
410
+ * dynamically linked library version \n
411
+ * - CURAND_STATUS_TYPE_ERROR if the value for \p rng_type is invalid \n
412
+ * - CURAND_STATUS_SUCCESS if generator was created successfully \n
413
+ */
414
+ curandStatus_t CURANDAPI
415
+ curandCreateGeneratorHost(curandGenerator_t *generator, curandRngType_t rng_type);
416
+
417
+ /**
418
+ * \brief Destroy an existing generator.
419
+ *
420
+ * Destroy an existing generator and free all memory associated with its state.
421
+ *
422
+ * \param generator - Generator to destroy
423
+ *
424
+ * \return
425
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
426
+ * - CURAND_STATUS_SUCCESS if generator was destroyed successfully \n
427
+ */
428
+ curandStatus_t CURANDAPI
429
+ curandDestroyGenerator(curandGenerator_t generator);
430
+
431
+ /**
432
+ * \brief Return the version number of the library.
433
+ *
434
+ * Return in \p *version the version number of the dynamically linked CURAND
435
+ * library. The format is the same as CUDART_VERSION from the CUDA Runtime.
436
+ * The only supported configuration is CURAND version equal to CUDA Runtime
437
+ * version.
438
+ *
439
+ * \param version - CURAND library version
440
+ *
441
+ * \return
442
+ * - CURAND_STATUS_SUCCESS if the version number was successfully returned \n
443
+ */
444
+ curandStatus_t CURANDAPI
445
+ curandGetVersion(int *version);
446
+
447
+ /**
448
+ * \brief Return the value of the curand property.
449
+ *
450
+ * Return in \p *value the number for the property described by \p type of the
451
+ * dynamically linked CURAND library.
452
+ *
453
+ * \param type - CUDA library property
454
+ * \param value - integer value for the requested property
455
+ *
456
+ * \return
457
+ * - CURAND_STATUS_SUCCESS if the property value was successfully returned \n
458
+ * - CURAND_STATUS_OUT_OF_RANGE if the property type is not recognized \n
459
+ */
460
+ curandStatus_t CURANDAPI
461
+ curandGetProperty(libraryPropertyType type, int *value);
462
+
463
+
464
+ /**
465
+ * \brief Set the current stream for CURAND kernel launches.
466
+ *
467
+ * Set the current stream for CURAND kernel launches. All library functions
468
+ * will use this stream until set again.
469
+ *
470
+ * \param generator - Generator to modify
471
+ * \param stream - Stream to use or ::NULL for null stream
472
+ *
473
+ * \return
474
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
475
+ * - CURAND_STATUS_SUCCESS if stream was set successfully \n
476
+ */
477
+ curandStatus_t CURANDAPI
478
+ curandSetStream(curandGenerator_t generator, cudaStream_t stream);
479
+
480
+ /**
481
+ * \brief Set the seed value of the pseudo-random number generator.
482
+ *
483
+ * Set the seed value of the pseudorandom number generator.
484
+ * All values of seed are valid. Different seeds will produce different sequences.
485
+ * Different seeds will often not be statistically correlated with each other,
486
+ * but some pairs of seed values may generate sequences which are statistically correlated.
487
+ *
488
+ * \param generator - Generator to modify
489
+ * \param seed - Seed value
490
+ *
491
+ * \return
492
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
493
+ * - CURAND_STATUS_TYPE_ERROR if the generator is not a pseudorandom number generator \n
494
+ * - CURAND_STATUS_SUCCESS if generator seed was set successfully \n
495
+ */
496
+ curandStatus_t CURANDAPI
497
+ curandSetPseudoRandomGeneratorSeed(curandGenerator_t generator, unsigned long long seed);
498
+
499
+ /**
500
+ * \brief Set the absolute offset of the pseudo or quasirandom number generator.
501
+ *
502
+ * Set the absolute offset of the pseudo or quasirandom number generator.
503
+ *
504
+ * All values of offset are valid. The offset position is absolute, not
505
+ * relative to the current position in the sequence.
506
+ *
507
+ * \param generator - Generator to modify
508
+ * \param offset - Absolute offset position
509
+ *
510
+ * \return
511
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
512
+ * - CURAND_STATUS_SUCCESS if generator offset was set successfully \n
513
+ */
514
+ curandStatus_t CURANDAPI
515
+ curandSetGeneratorOffset(curandGenerator_t generator, unsigned long long offset);
516
+
517
+ /**
518
+ * \brief Set the ordering of results of the pseudo or quasirandom number generator.
519
+ *
520
+ * Set the ordering of results of the pseudo or quasirandom number generator.
521
+ *
522
+ * Legal values of \p order for pseudorandom generators are:
523
+ * - CURAND_ORDERING_PSEUDO_DEFAULT
524
+ * - CURAND_ORDERING_PSEUDO_BEST
525
+ * - CURAND_ORDERING_PSEUDO_SEEDED
526
+ * - CURAND_ORDERING_PSEUDO_LEGACY
527
+ *
528
+ * Legal values of \p order for quasirandom generators are:
529
+ * - CURAND_ORDERING_QUASI_DEFAULT
530
+ *
531
+ * \param generator - Generator to modify
532
+ * \param order - Ordering of results
533
+ *
534
+ * \return
535
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
536
+ * - CURAND_STATUS_OUT_OF_RANGE if the ordering is not valid \n
537
+ * - CURAND_STATUS_SUCCESS if generator ordering was set successfully \n
538
+ */
539
+ curandStatus_t CURANDAPI
540
+ curandSetGeneratorOrdering(curandGenerator_t generator, curandOrdering_t order);
541
+
542
+ /**
543
+ * \brief Set the number of dimensions.
544
+ *
545
+ * Set the number of dimensions to be generated by the quasirandom number
546
+ * generator.
547
+ *
548
+ * Legal values for \p num_dimensions are 1 to 20000.
549
+ *
550
+ * \param generator - Generator to modify
551
+ * \param num_dimensions - Number of dimensions
552
+ *
553
+ * \return
554
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
555
+ * - CURAND_STATUS_OUT_OF_RANGE if num_dimensions is not valid \n
556
+ * - CURAND_STATUS_TYPE_ERROR if the generator is not a quasirandom number generator \n
557
+ * - CURAND_STATUS_SUCCESS if generator ordering was set successfully \n
558
+ */
559
+ curandStatus_t CURANDAPI
560
+ curandSetQuasiRandomGeneratorDimensions(curandGenerator_t generator, unsigned int num_dimensions);
561
+
562
+ /**
563
+ * \brief Generate 32-bit pseudo or quasirandom numbers.
564
+ *
565
+ * Use \p generator to generate \p num 32-bit results into the device memory at
566
+ * \p outputPtr. The device memory must have been previously allocated and be
567
+ * large enough to hold all the results. Launches are done with the stream
568
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
569
+ *
570
+ * Results are 32-bit values with every bit random.
571
+ *
572
+ * \param generator - Generator to use
573
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
574
+ * Pointer to host memory to store CPU-generated results
575
+ * \param num - Number of random 32-bit values to generate
576
+ *
577
+ * \return
578
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
579
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
580
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
581
+ * a previous kernel launch \n
582
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
583
+ * not a multiple of the quasirandom dimension \n
584
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
585
+ * - CURAND_STATUS_TYPE_ERROR if the generator is a 64 bit quasirandom generator.
586
+ * (use ::curandGenerateLongLong() with 64 bit quasirandom generators)
587
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
588
+ */
589
+ curandStatus_t CURANDAPI
590
+ curandGenerate(curandGenerator_t generator, unsigned int *outputPtr, size_t num);
591
+
592
+ /**
593
+ * \brief Generate 64-bit quasirandom numbers.
594
+ *
595
+ * Use \p generator to generate \p num 64-bit results into the device memory at
596
+ * \p outputPtr. The device memory must have been previously allocated and be
597
+ * large enough to hold all the results. Launches are done with the stream
598
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
599
+ *
600
+ * Results are 64-bit values with every bit random.
601
+ *
602
+ * \param generator - Generator to use
603
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
604
+ * Pointer to host memory to store CPU-generated results
605
+ * \param num - Number of random 64-bit values to generate
606
+ *
607
+ * \return
608
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
609
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
610
+ * a previous kernel launch \n
611
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
612
+ * not a multiple of the quasirandom dimension \n
613
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
614
+ * - CURAND_STATUS_TYPE_ERROR if the generator is not a 64 bit quasirandom generator\n
615
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
616
+ */
617
+ curandStatus_t CURANDAPI
618
+ curandGenerateLongLong(curandGenerator_t generator, unsigned long long *outputPtr, size_t num);
619
+
620
+ /**
621
+ * \brief Generate uniformly distributed floats.
622
+ *
623
+ * Use \p generator to generate \p num float results into the device memory at
624
+ * \p outputPtr. The device memory must have been previously allocated and be
625
+ * large enough to hold all the results. Launches are done with the stream
626
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
627
+ *
628
+ * Results are 32-bit floating point values between \p 0.0f and \p 1.0f,
629
+ * excluding \p 0.0f and including \p 1.0f.
630
+ *
631
+ * \param generator - Generator to use
632
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
633
+ * Pointer to host memory to store CPU-generated results
634
+ * \param num - Number of floats to generate
635
+ *
636
+ * \return
637
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
638
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
639
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
640
+ * a previous kernel launch \n
641
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
642
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
643
+ * not a multiple of the quasirandom dimension \n
644
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
645
+ */
646
+ curandStatus_t CURANDAPI
647
+ curandGenerateUniform(curandGenerator_t generator, float *outputPtr, size_t num);
648
+
649
+ /**
650
+ * \brief Generate uniformly distributed doubles.
651
+ *
652
+ * Use \p generator to generate \p num double results into the device memory at
653
+ * \p outputPtr. The device memory must have been previously allocated and be
654
+ * large enough to hold all the results. Launches are done with the stream
655
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
656
+ *
657
+ * Results are 64-bit double precision floating point values between
658
+ * \p 0.0 and \p 1.0, excluding \p 0.0 and including \p 1.0.
659
+ *
660
+ * \param generator - Generator to use
661
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
662
+ * Pointer to host memory to store CPU-generated results
663
+ * \param num - Number of doubles to generate
664
+ *
665
+ * \return
666
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
667
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
668
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
669
+ * a previous kernel launch \n
670
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
671
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
672
+ * not a multiple of the quasirandom dimension \n
673
+ * - CURAND_STATUS_DOUBLE_PRECISION_REQUIRED if the GPU does not support double precision \n
674
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
675
+ */
676
+ curandStatus_t CURANDAPI
677
+ curandGenerateUniformDouble(curandGenerator_t generator, double *outputPtr, size_t num);
678
+
679
+ /**
680
+ * \brief Generate normally distributed floats.
681
+ *
682
+ * Use \p generator to generate \p n float results into the device memory at
683
+ * \p outputPtr. The device memory must have been previously allocated and be
684
+ * large enough to hold all the results. Launches are done with the stream
685
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
686
+ *
687
+ * Results are 32-bit floating point values with mean \p mean and standard
688
+ * deviation \p stddev.
689
+ *
690
+ * Normally distributed results are generated from pseudorandom generators
691
+ * with a Box-Muller transform, and so require \p n to be even.
692
+ * Quasirandom generators use an inverse cumulative distribution
693
+ * function to preserve dimensionality.
694
+ *
695
+ * There may be slight numerical differences between results generated
696
+ * on the GPU with generators created with ::curandCreateGenerator()
697
+ * and results calculated on the CPU with generators created with
698
+ * ::curandCreateGeneratorHost(). These differences arise because of
699
+ * differences in results for transcendental functions. In addition,
700
+ * future versions of CURAND may use newer versions of the CUDA math
701
+ * library, so different versions of CURAND may give slightly different
702
+ * numerical values.
703
+ *
704
+ * \param generator - Generator to use
705
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
706
+ * Pointer to host memory to store CPU-generated results
707
+ * \param n - Number of floats to generate
708
+ * \param mean - Mean of normal distribution
709
+ * \param stddev - Standard deviation of normal distribution
710
+ *
711
+ * \return
712
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
713
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
714
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
715
+ * a previous kernel launch \n
716
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
717
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
718
+ * not a multiple of the quasirandom dimension, or is not a multiple
719
+ * of two for pseudorandom generators \n
720
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
721
+ */
722
+ curandStatus_t CURANDAPI
723
+ curandGenerateNormal(curandGenerator_t generator, float *outputPtr,
724
+ size_t n, float mean, float stddev);
725
+
726
+ /**
727
+ * \brief Generate normally distributed doubles.
728
+ *
729
+ * Use \p generator to generate \p n double results into the device memory at
730
+ * \p outputPtr. The device memory must have been previously allocated and be
731
+ * large enough to hold all the results. Launches are done with the stream
732
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
733
+ *
734
+ * Results are 64-bit floating point values with mean \p mean and standard
735
+ * deviation \p stddev.
736
+ *
737
+ * Normally distributed results are generated from pseudorandom generators
738
+ * with a Box-Muller transform, and so require \p n to be even.
739
+ * Quasirandom generators use an inverse cumulative distribution
740
+ * function to preserve dimensionality.
741
+ *
742
+ * There may be slight numerical differences between results generated
743
+ * on the GPU with generators created with ::curandCreateGenerator()
744
+ * and results calculated on the CPU with generators created with
745
+ * ::curandCreateGeneratorHost(). These differences arise because of
746
+ * differences in results for transcendental functions. In addition,
747
+ * future versions of CURAND may use newer versions of the CUDA math
748
+ * library, so different versions of CURAND may give slightly different
749
+ * numerical values.
750
+ *
751
+ * \param generator - Generator to use
752
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
753
+ * Pointer to host memory to store CPU-generated results
754
+ * \param n - Number of doubles to generate
755
+ * \param mean - Mean of normal distribution
756
+ * \param stddev - Standard deviation of normal distribution
757
+ *
758
+ * \return
759
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
760
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
761
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
762
+ * a previous kernel launch \n
763
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
764
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
765
+ * not a multiple of the quasirandom dimension, or is not a multiple
766
+ * of two for pseudorandom generators \n
767
+ * - CURAND_STATUS_DOUBLE_PRECISION_REQUIRED if the GPU does not support double precision \n
768
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
769
+ */
770
+ curandStatus_t CURANDAPI
771
+ curandGenerateNormalDouble(curandGenerator_t generator, double *outputPtr,
772
+ size_t n, double mean, double stddev);
773
+
774
+ /**
775
+ * \brief Generate log-normally distributed floats.
776
+ *
777
+ * Use \p generator to generate \p n float results into the device memory at
778
+ * \p outputPtr. The device memory must have been previously allocated and be
779
+ * large enough to hold all the results. Launches are done with the stream
780
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
781
+ *
782
+ * Results are 32-bit floating point values with log-normal distribution based on
783
+ * an associated normal distribution with mean \p mean and standard deviation \p stddev.
784
+ *
785
+ * Normally distributed results are generated from pseudorandom generators
786
+ * with a Box-Muller transform, and so require \p n to be even.
787
+ * Quasirandom generators use an inverse cumulative distribution
788
+ * function to preserve dimensionality.
789
+ * The normally distributed results are transformed into log-normal distribution.
790
+ *
791
+ * There may be slight numerical differences between results generated
792
+ * on the GPU with generators created with ::curandCreateGenerator()
793
+ * and results calculated on the CPU with generators created with
794
+ * ::curandCreateGeneratorHost(). These differences arise because of
795
+ * differences in results for transcendental functions. In addition,
796
+ * future versions of CURAND may use newer versions of the CUDA math
797
+ * library, so different versions of CURAND may give slightly different
798
+ * numerical values.
799
+ *
800
+ * \param generator - Generator to use
801
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
802
+ * Pointer to host memory to store CPU-generated results
803
+ * \param n - Number of floats to generate
804
+ * \param mean - Mean of associated normal distribution
805
+ * \param stddev - Standard deviation of associated normal distribution
806
+ *
807
+ * \return
808
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
809
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
810
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
811
+ * a previous kernel launch \n
812
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
813
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
814
+ * not a multiple of the quasirandom dimension, or is not a multiple
815
+ * of two for pseudorandom generators \n
816
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
817
+ */
818
+ curandStatus_t CURANDAPI
819
+ curandGenerateLogNormal(curandGenerator_t generator, float *outputPtr,
820
+ size_t n, float mean, float stddev);
821
+
822
+ /**
823
+ * \brief Generate log-normally distributed doubles.
824
+ *
825
+ * Use \p generator to generate \p n double results into the device memory at
826
+ * \p outputPtr. The device memory must have been previously allocated and be
827
+ * large enough to hold all the results. Launches are done with the stream
828
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
829
+ *
830
+ * Results are 64-bit floating point values with log-normal distribution based on
831
+ * an associated normal distribution with mean \p mean and standard deviation \p stddev.
832
+ *
833
+ * Normally distributed results are generated from pseudorandom generators
834
+ * with a Box-Muller transform, and so require \p n to be even.
835
+ * Quasirandom generators use an inverse cumulative distribution
836
+ * function to preserve dimensionality.
837
+ * The normally distributed results are transformed into log-normal distribution.
838
+ *
839
+ * There may be slight numerical differences between results generated
840
+ * on the GPU with generators created with ::curandCreateGenerator()
841
+ * and results calculated on the CPU with generators created with
842
+ * ::curandCreateGeneratorHost(). These differences arise because of
843
+ * differences in results for transcendental functions. In addition,
844
+ * future versions of CURAND may use newer versions of the CUDA math
845
+ * library, so different versions of CURAND may give slightly different
846
+ * numerical values.
847
+ *
848
+ * \param generator - Generator to use
849
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
850
+ * Pointer to host memory to store CPU-generated results
851
+ * \param n - Number of doubles to generate
852
+ * \param mean - Mean of normal distribution
853
+ * \param stddev - Standard deviation of normal distribution
854
+ *
855
+ * \return
856
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
857
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
858
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
859
+ * a previous kernel launch \n
860
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
861
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
862
+ * not a multiple of the quasirandom dimension, or is not a multiple
863
+ * of two for pseudorandom generators \n
864
+ * - CURAND_STATUS_DOUBLE_PRECISION_REQUIRED if the GPU does not support double precision \n
865
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
866
+ */
867
+ curandStatus_t CURANDAPI
868
+ curandGenerateLogNormalDouble(curandGenerator_t generator, double *outputPtr,
869
+ size_t n, double mean, double stddev);
870
+
871
+ /**
872
+ * \brief Construct the histogram array for a Poisson distribution.
873
+ *
874
+ * Construct the histogram array for the Poisson distribution with lambda \p lambda.
875
+ * For lambda greater than 2000, an approximation with a normal distribution is used.
876
+ *
877
+ * \param lambda - lambda for the Poisson distribution
878
+ *
879
+ *
880
+ * \param discrete_distribution - pointer to the histogram in device memory
881
+ *
882
+ * \return
883
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
884
+ * - CURAND_STATUS_DOUBLE_PRECISION_REQUIRED if the GPU does not support double precision \n
885
+ * - CURAND_STATUS_INITIALIZATION_FAILED if there was a problem setting up the GPU \n
886
+ * - CURAND_STATUS_NOT_INITIALIZED if the distribution pointer was null \n
887
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
888
+ * a previous kernel launch \n
889
+ * - CURAND_STATUS_OUT_OF_RANGE if lambda is non-positive or greater than 400,000 \n
890
+ * - CURAND_STATUS_SUCCESS if the histogram was generated successfully \n
891
+ */
892
+
893
+ curandStatus_t CURANDAPI
894
+ curandCreatePoissonDistribution(double lambda, curandDiscreteDistribution_t *discrete_distribution);
895
+
896
+
897
+
898
+ /**
899
+ * \brief Destroy the histogram array for a discrete distribution (e.g. Poisson).
900
+ *
901
+ * Destroy the histogram array for a discrete distribution created by curandCreatePoissonDistribution.
902
+ *
903
+ * \param discrete_distribution - pointer to device memory where the histogram is stored
904
+ *
905
+ * \return
906
+ * - CURAND_STATUS_NOT_INITIALIZED if the histogram was never created \n
907
+ * - CURAND_STATUS_SUCCESS if the histogram was destroyed successfully \n
908
+ */
909
+ curandStatus_t CURANDAPI
910
+ curandDestroyDistribution(curandDiscreteDistribution_t discrete_distribution);
911
+
912
+
913
+ /**
914
+ * \brief Generate Poisson-distributed unsigned ints.
915
+ *
916
+ * Use \p generator to generate \p n unsigned int results into device memory at
917
+ * \p outputPtr. The device memory must have been previously allocated and must be
918
+ * large enough to hold all the results. Launches are done with the stream
919
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
920
+ *
921
+ * Results are 32-bit unsigned int point values with Poisson distribution, with lambda \p lambda.
922
+ *
923
+ * \param generator - Generator to use
924
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
925
+ * Pointer to host memory to store CPU-generated results
926
+ * \param n - Number of unsigned ints to generate
927
+ * \param lambda - lambda for the Poisson distribution
928
+ *
929
+ * \return
930
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
931
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
932
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
933
+ * a previous kernel launch \n
934
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
935
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
936
+ * not a multiple of the quasirandom dimension\n
937
+ * - CURAND_STATUS_DOUBLE_PRECISION_REQUIRED if the GPU or sm does not support double precision \n
938
+ * - CURAND_STATUS_OUT_OF_RANGE if lambda is non-positive or greater than 400,000 \n
939
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
940
+ */
941
+
942
+ curandStatus_t CURANDAPI
943
+ curandGeneratePoisson(curandGenerator_t generator, unsigned int *outputPtr,
944
+ size_t n, double lambda);
945
+ // just for internal usage
946
+ curandStatus_t CURANDAPI
947
+ curandGeneratePoissonMethod(curandGenerator_t generator, unsigned int *outputPtr,
948
+ size_t n, double lambda, curandMethod_t method);
949
+
950
+
951
+ curandStatus_t CURANDAPI
952
+ curandGenerateBinomial(curandGenerator_t generator, unsigned int *outputPtr,
953
+ size_t num, unsigned int n, double p);
954
+ // just for internal usage
955
+ curandStatus_t CURANDAPI
956
+ curandGenerateBinomialMethod(curandGenerator_t generator,
957
+ unsigned int *outputPtr,
958
+ size_t num, unsigned int n, double p,
959
+ curandMethod_t method);
960
+
961
+
962
+ /**
963
+ * \brief Setup starting states.
964
+ *
965
+ * Generate the starting state of the generator. This function is
966
+ * automatically called by generation functions such as
967
+ * ::curandGenerate() and ::curandGenerateUniform().
968
+ * It can be called manually for performance testing reasons to separate
969
+ * timings for starting state generation and random number generation.
970
+ *
971
+ * \param generator - Generator to update
972
+ *
973
+ * \return
974
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
975
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
976
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
977
+ * a previous kernel launch \n
978
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
979
+ * - CURAND_STATUS_SUCCESS if the seeds were generated successfully \n
980
+ */
981
+ curandStatus_t CURANDAPI
982
+ curandGenerateSeeds(curandGenerator_t generator);
983
+
984
+ /**
985
+ * \brief Get direction vectors for 32-bit quasirandom number generation.
986
+ *
987
+ * Get a pointer to an array of direction vectors that can be used
988
+ * for quasirandom number generation. The resulting pointer will
989
+ * reference an array of direction vectors in host memory.
990
+ *
991
+ * The array contains vectors for many dimensions. Each dimension
992
+ * has 32 vectors. Each individual vector is an unsigned int.
993
+ *
994
+ * Legal values for \p set are:
995
+ * - CURAND_DIRECTION_VECTORS_32_JOEKUO6 (20,000 dimensions)
996
+ * - CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 (20,000 dimensions)
997
+ *
998
+ * \param vectors - Address of pointer in which to return direction vectors
999
+ * \param set - Which set of direction vectors to use
1000
+ *
1001
+ * \return
1002
+ * - CURAND_STATUS_OUT_OF_RANGE if the choice of set is invalid \n
1003
+ * - CURAND_STATUS_SUCCESS if the pointer was set successfully \n
1004
+ */
1005
+ curandStatus_t CURANDAPI
1006
+ curandGetDirectionVectors32(curandDirectionVectors32_t *vectors[], curandDirectionVectorSet_t set);
1007
+
1008
+ /**
1009
+ * \brief Get scramble constants for 32-bit scrambled Sobol' .
1010
+ *
1011
+ * Get a pointer to an array of scramble constants that can be used
1012
+ * for quasirandom number generation. The resulting pointer will
1013
+ * reference an array of unsinged ints in host memory.
1014
+ *
1015
+ * The array contains constants for many dimensions. Each dimension
1016
+ * has a single unsigned int constant.
1017
+ *
1018
+ * \param constants - Address of pointer in which to return scramble constants
1019
+ *
1020
+ * \return
1021
+ * - CURAND_STATUS_SUCCESS if the pointer was set successfully \n
1022
+ */
1023
+ curandStatus_t CURANDAPI
1024
+ curandGetScrambleConstants32(unsigned int * * constants);
1025
+
1026
+ /**
1027
+ * \brief Get direction vectors for 64-bit quasirandom number generation.
1028
+ *
1029
+ * Get a pointer to an array of direction vectors that can be used
1030
+ * for quasirandom number generation. The resulting pointer will
1031
+ * reference an array of direction vectors in host memory.
1032
+ *
1033
+ * The array contains vectors for many dimensions. Each dimension
1034
+ * has 64 vectors. Each individual vector is an unsigned long long.
1035
+ *
1036
+ * Legal values for \p set are:
1037
+ * - CURAND_DIRECTION_VECTORS_64_JOEKUO6 (20,000 dimensions)
1038
+ * - CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 (20,000 dimensions)
1039
+ *
1040
+ * \param vectors - Address of pointer in which to return direction vectors
1041
+ * \param set - Which set of direction vectors to use
1042
+ *
1043
+ * \return
1044
+ * - CURAND_STATUS_OUT_OF_RANGE if the choice of set is invalid \n
1045
+ * - CURAND_STATUS_SUCCESS if the pointer was set successfully \n
1046
+ */
1047
+ curandStatus_t CURANDAPI
1048
+ curandGetDirectionVectors64(curandDirectionVectors64_t *vectors[], curandDirectionVectorSet_t set);
1049
+
1050
+ /**
1051
+ * \brief Get scramble constants for 64-bit scrambled Sobol' .
1052
+ *
1053
+ * Get a pointer to an array of scramble constants that can be used
1054
+ * for quasirandom number generation. The resulting pointer will
1055
+ * reference an array of unsinged long longs in host memory.
1056
+ *
1057
+ * The array contains constants for many dimensions. Each dimension
1058
+ * has a single unsigned long long constant.
1059
+ *
1060
+ * \param constants - Address of pointer in which to return scramble constants
1061
+ *
1062
+ * \return
1063
+ * - CURAND_STATUS_SUCCESS if the pointer was set successfully \n
1064
+ */
1065
+ curandStatus_t CURANDAPI
1066
+ curandGetScrambleConstants64(unsigned long long * * constants);
1067
+
1068
+ /** @} */
1069
+
1070
+ #endif // __CUDACC_RTC__
1071
+
1072
+ #if defined(__cplusplus)
1073
+ }
1074
+ #endif /* __cplusplus */
1075
+
1076
+
1077
+ #endif /* !defined(CURAND_H_) */
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #if !defined(CURANDDISCRETE_H_)
50
+ #define CURANDDISCRETE_H_
51
+
52
+ struct curandDistributionShift_st {
53
+ curandDistribution_t probability;
54
+ curandDistribution_t host_probability;
55
+ unsigned int shift;
56
+ unsigned int length;
57
+ unsigned int host_gen;
58
+ };
59
+
60
+ struct curandHistogramM2_st {
61
+ curandHistogramM2V_t V;
62
+ curandHistogramM2V_t host_V;
63
+ curandHistogramM2K_t K;
64
+ curandHistogramM2K_t host_K;
65
+ unsigned int host_gen;
66
+ };
67
+
68
+
69
+ struct curandDistributionM2Shift_st {
70
+ curandHistogramM2_t histogram;
71
+ curandHistogramM2_t host_histogram;
72
+ unsigned int shift;
73
+ unsigned int length;
74
+ unsigned int host_gen;
75
+ };
76
+
77
+ struct curandDiscreteDistribution_st {
78
+ curandDiscreteDistribution_t self_host_ptr;
79
+ curandDistributionM2Shift_t M2;
80
+ curandDistributionM2Shift_t host_M2;
81
+ double stddev;
82
+ double mean;
83
+ curandMethod_t method;
84
+ unsigned int host_gen;
85
+ };
86
+
87
+ #endif // !defined(CURANDDISCRETE_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_DISCRETE_H_)
52
+ #define CURAND_DISCRETE_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+
68
+
69
+ template <typename T>
70
+ QUALIFIERS unsigned int _curand_discrete(T x, curandDiscreteDistribution_t discrete_distribution){
71
+ if (discrete_distribution->method == CURAND_M2){
72
+ return _curand_M2_double(x, discrete_distribution->M2);
73
+ }
74
+ return (unsigned int)((discrete_distribution->stddev * _curand_normal_icdf_double(x)) + discrete_distribution->mean + 0.5);
75
+ }
76
+
77
+
78
+ template <typename STATE>
79
+ QUALIFIERS unsigned int curand__discrete(STATE state, curandDiscreteDistribution_t discrete_distribution){
80
+ if (discrete_distribution->method == CURAND_M2){
81
+ return curand_M2_double(state, discrete_distribution->M2);
82
+ }
83
+ return (unsigned int)((discrete_distribution->stddev * curand_normal_double(state)) + discrete_distribution->mean + 0.5); //Round to nearest
84
+ }
85
+
86
+ template <typename STATE>
87
+ QUALIFIERS uint4 curand__discrete4(STATE state, curandDiscreteDistribution_t discrete_distribution){
88
+ if (discrete_distribution->method == CURAND_M2){
89
+ return curand_M2_double4(state, discrete_distribution->M2);
90
+ }
91
+ double4 _res;
92
+ uint4 result;
93
+ _res = curand_normal4_double(state);
94
+ result.x = (unsigned int)((discrete_distribution->stddev * _res.x) + discrete_distribution->mean + 0.5); //Round to nearest
95
+ result.y = (unsigned int)((discrete_distribution->stddev * _res.y) + discrete_distribution->mean + 0.5); //Round to nearest
96
+ result.z = (unsigned int)((discrete_distribution->stddev * _res.z) + discrete_distribution->mean + 0.5); //Round to nearest
97
+ result.w = (unsigned int)((discrete_distribution->stddev * _res.w) + discrete_distribution->mean + 0.5); //Round to nearest
98
+ return result;
99
+ }
100
+
101
+ /*
102
+ * \brief Return a discrete distributed unsigned int from a XORWOW generator.
103
+ *
104
+ * Return a single discrete distributed unsigned int derived from a
105
+ * distribution defined by \p discrete_distribution from the XORWOW generator in \p state,
106
+ * increment position of generator by one.
107
+ *
108
+ * \param state - Pointer to state to update
109
+ * \param discrete_distribution - ancillary structure for discrete distribution
110
+ *
111
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
112
+ */
113
+ QUALIFIERS unsigned int curand_discrete(curandStateXORWOW_t *state, curandDiscreteDistribution_t discrete_distribution)
114
+ {
115
+ return curand__discrete(state, discrete_distribution);
116
+ }
117
+
118
+ /*
119
+ * \brief Return a discrete distributed unsigned int from a Philox4_32_10 generator.
120
+ *
121
+ * Return a single discrete distributed unsigned int derived from a
122
+ * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
123
+ * increment position of generator by one.
124
+ *
125
+ * \param state - Pointer to state to update
126
+ * \param discrete_distribution - ancillary structure for discrete distribution
127
+ *
128
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
129
+ */
130
+ QUALIFIERS unsigned int curand_discrete(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
131
+ {
132
+ return curand__discrete(state, discrete_distribution);
133
+ }
134
+
135
+ /*
136
+ * \brief Return four discrete distributed unsigned ints from a Philox4_32_10 generator.
137
+ *
138
+ * Return four single discrete distributed unsigned ints derived from a
139
+ * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
140
+ * increment position of generator by one.
141
+ *
142
+ * \param state - Pointer to state to update
143
+ * \param discrete_distribution - ancillary structure for discrete distribution
144
+ *
145
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
146
+ */
147
+ QUALIFIERS uint4 curand_discrete4(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
148
+ {
149
+ return curand__discrete4(state, discrete_distribution);
150
+ }
151
+ /*
152
+ * \brief Return a discrete distributed unsigned int from a MRG32k3a generator.
153
+ *
154
+ * Re turn a single discrete distributed unsigned int derived from a
155
+ * distribution defined by \p discrete_distribution from the MRG32k3a generator in \p state,
156
+ * increment position of generator by one.
157
+ *
158
+ * \param state - Pointer to state to update
159
+ * \param discrete_distribution - ancillary structure for discrete distribution
160
+ *
161
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
162
+ */
163
+ QUALIFIERS unsigned int curand_discrete(curandStateMRG32k3a_t *state, curandDiscreteDistribution_t discrete_distribution)
164
+ {
165
+ return curand__discrete(state, discrete_distribution);
166
+ }
167
+
168
+ /*
169
+ * \brief Return a discrete distributed unsigned int from a MTGP32 generator.
170
+ *
171
+ * Return a single discrete distributed unsigned int derived from a
172
+ * distribution defined by \p discrete_distribution from the MTGP32 generator in \p state,
173
+ * increment position of generator by one.
174
+ *
175
+ * \param state - Pointer to state to update
176
+ * \param discrete_distribution - ancillary structure for discrete distribution
177
+ *
178
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
179
+ */
180
+ QUALIFIERS unsigned int curand_discrete(curandStateMtgp32_t *state, curandDiscreteDistribution_t discrete_distribution)
181
+ {
182
+ return curand__discrete(state, discrete_distribution);
183
+ }
184
+
185
+ /*
186
+ * \brief Return a discrete distributed unsigned int from a Sobol32 generator.
187
+ *
188
+ * Return a single discrete distributed unsigned int derived from a
189
+ * distribution defined by \p discrete_distribution from the Sobol32 generator in \p state,
190
+ * increment position of generator by one.
191
+ *
192
+ * \param state - Pointer to state to update
193
+ * \param discrete_distribution - ancillary structure for discrete distribution
194
+ *
195
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
196
+ */
197
+ QUALIFIERS unsigned int curand_discrete(curandStateSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
198
+ {
199
+ return curand__discrete(state, discrete_distribution);
200
+ }
201
+
202
+ /*
203
+ * \brief Return a discrete distributed unsigned int from a scrambled Sobol32 generator.
204
+ *
205
+ * Return a single discrete distributed unsigned int derived from a
206
+ * distribution defined by \p discrete_distribution from the scrambled Sobol32 generator in \p state,
207
+ * increment position of generator by one.
208
+ *
209
+ * \param state - Pointer to state to update
210
+ * \param discrete_distribution - ancillary structure for discrete distribution
211
+ *
212
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
213
+ */
214
+ QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
215
+ {
216
+ return curand__discrete(state, discrete_distribution);
217
+ }
218
+
219
+ /*
220
+ * \brief Return a discrete distributed unsigned int from a Sobol64 generator.
221
+ *
222
+ * Return a single discrete distributed unsigned int derived from a
223
+ * distribution defined by \p discrete_distribution from the Sobol64 generator in \p state,
224
+ * increment position of generator by one.
225
+ *
226
+ * \param state - Pointer to state to update
227
+ * \param discrete_distribution - ancillary structure for discrete distribution
228
+ *
229
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
230
+ */
231
+ QUALIFIERS unsigned int curand_discrete(curandStateSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
232
+ {
233
+ return curand__discrete(state, discrete_distribution);
234
+ }
235
+
236
+ /*
237
+ * \brief Return a discrete distributed unsigned int from a scrambled Sobol64 generator.
238
+ *
239
+ * Return a single discrete distributed unsigned int derived from a
240
+ * distribution defined by \p discrete_distribution from the scrambled Sobol64 generator in \p state,
241
+ * increment position of generator by one.
242
+ *
243
+ * \param state - Pointer to state to update
244
+ * \param discrete_distribution - ancillary structure for discrete distribution
245
+ *
246
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
247
+ */
248
+ QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
249
+ {
250
+ return curand__discrete(state, discrete_distribution);
251
+ }
252
+
253
+ #endif // !defined(CURAND_DISCRETE_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_globals.h ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+ #ifndef CURAND_GLOBALS_H
49
+ #define CURAND_GLOBALS_H
50
+
51
+ #define MAX_XOR_N (5)
52
+ #define SKIPAHEAD_BLOCKSIZE (4)
53
+ #define SKIPAHEAD_MASK ((1<<SKIPAHEAD_BLOCKSIZE)-1)
54
+ #define CURAND_2POW32 (4294967296.f)
55
+ #define CURAND_2POW32_DOUBLE (4294967296.)
56
+ #define CURAND_2POW32_INV (2.3283064e-10f)
57
+ #define CURAND_2POW32_INV_DOUBLE (2.3283064365386963e-10)
58
+ #define CURAND_2POW53_INV_DOUBLE (1.1102230246251565e-16)
59
+ #define CURAND_2POW32_INV_2PI (2.3283064e-10f * 6.2831855f)
60
+ #define CURAND_2PI (6.2831855f)
61
+ #define CURAND_2POW53_INV_2PI_DOUBLE (1.1102230246251565e-16 * 6.2831853071795860)
62
+ #define CURAND_PI_DOUBLE (3.1415926535897932)
63
+ #define CURAND_2PI_DOUBLE (6.2831853071795860)
64
+ #define CURAND_SQRT2 (-1.4142135f)
65
+ #define CURAND_SQRT2_DOUBLE (-1.4142135623730951)
66
+
67
+ #define SOBOL64_ITR_BINARY_DIVIDE 2
68
+ #define SOBOL_M2_BINARY_DIVIDE 10
69
+ #define MTGP32_M2_BINARY_DIVIDE 32
70
+ #define MAX_LAMBDA 400000
71
+ #define MIN_GAUSS_LAMBDA 2000
72
+
73
+ struct normal_args_st {
74
+ float mean;
75
+ float stddev;
76
+ };
77
+
78
+ typedef struct normal_args_st normal_args_t;
79
+
80
+ struct normal_args_double_st {
81
+ double mean;
82
+ double stddev;
83
+ };
84
+
85
+ typedef struct normal_args_double_st normal_args_double_t;
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+ #endif
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_kernel.h ADDED
@@ -0,0 +1,1677 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_KERNEL_H_)
52
+ #define CURAND_KERNEL_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #if !defined(QUALIFIERS)
61
+ #define QUALIFIERS static __forceinline__ __device__
62
+ #endif
63
+
64
+ /* To prevent unused parameter warnings */
65
+ #if !defined(GCC_UNUSED_PARAMETER)
66
+ #if defined(__GNUC__)
67
+ #define GCC_UNUSED_PARAMETER __attribute__((unused))
68
+ #else
69
+ #define GCC_UNUSED_PARAMETER
70
+ #endif /* defined(__GNUC__) */
71
+ #endif /* !defined(GCC_UNUSED_PARAMETER) */
72
+
73
+ #include <nv/target>
74
+
75
+ #ifdef __CUDACC_RTC__
76
+ #define CURAND_DETAIL_USE_CUDA_STL
77
+ #endif
78
+
79
+ #if __cplusplus >= 201103L
80
+ # ifdef CURAND_DETAIL_USE_CUDA_STL
81
+ # define CURAND_STD cuda::std
82
+ # include <cuda/std/type_traits>
83
+ # else
84
+ # define CURAND_STD std
85
+ # include <type_traits>
86
+ # endif // CURAND_DETAIL_USE_CUDA_STL
87
+ #else
88
+ // To support C++03 compilation
89
+ # define CURAND_STD curand_detail
90
+ namespace curand_detail {
91
+ template<bool B, class T = void>
92
+ struct enable_if {};
93
+
94
+ template<class T>
95
+ struct enable_if<true, T> { typedef T type; };
96
+
97
+ template<class T, class U>
98
+ struct is_same { static const bool value = false; };
99
+
100
+ template<class T>
101
+ struct is_same<T, T> { static const bool value = true; };
102
+ } // namespace curand_detail
103
+ #endif // __cplusplus >= 201103L
104
+
105
+ #ifndef __CUDACC_RTC__
106
+ #include <math.h>
107
+ #endif // __CUDACC_RTC__
108
+
109
+ #include "curand.h"
110
+ #include "curand_discrete.h"
111
+ #include "curand_precalc.h"
112
+ #include "curand_mrg32k3a.h"
113
+ #include "curand_mtgp32_kernel.h"
114
+ #include "curand_philox4x32_x.h"
115
+ #include "curand_globals.h"
116
+
117
+ /* Test RNG */
118
+ /* This generator uses the formula:
119
+ x_n = x_(n-1) + 1 mod 2^32
120
+ x_0 = (unsigned int)seed * 3
121
+ Subsequences are spaced 31337 steps apart.
122
+ */
123
+ struct curandStateTest {
124
+ unsigned int v;
125
+ };
126
+
127
+ /** \cond UNHIDE_TYPEDEFS */
128
+ typedef struct curandStateTest curandStateTest_t;
129
+ /** \endcond */
130
+
131
+ /* XORSHIFT FAMILY RNGs */
132
+ /* These generators are a family proposed by Marsaglia. They keep state
133
+ in 32 bit chunks, then use repeated shift and xor operations to scramble
134
+ the bits. The following generators are a combination of a simple Weyl
135
+ generator with an N variable XORSHIFT generator.
136
+ */
137
+
138
+ /* XORSHIFT RNG */
139
+ /* This generator uses the xorwow formula of
140
+ www.jstatsoft.org/v08/i14/paper page 5
141
+ Has period 2^192 - 2^32.
142
+ */
143
+ /**
144
+ * CURAND XORWOW state
145
+ */
146
+ struct curandStateXORWOW;
147
+
148
+ /*
149
+ * Implementation details not in reference documentation */
150
+ struct curandStateXORWOW {
151
+ unsigned int d, v[5];
152
+ int boxmuller_flag;
153
+ int boxmuller_flag_double;
154
+ float boxmuller_extra;
155
+ double boxmuller_extra_double;
156
+ };
157
+
158
+ /*
159
+ * CURAND XORWOW state
160
+ */
161
+ /** \cond UNHIDE_TYPEDEFS */
162
+ typedef struct curandStateXORWOW curandStateXORWOW_t;
163
+
164
+ #define EXTRA_FLAG_NORMAL 0x00000001
165
+ #define EXTRA_FLAG_LOG_NORMAL 0x00000002
166
+ /** \endcond */
167
+
168
+ /* Combined Multiple Recursive Generators */
169
+ /* These generators are a family proposed by L'Ecuyer. They keep state
170
+ in sets of doubles, then use repeated modular arithmetic multiply operations
171
+ to scramble the bits in each set, and combine the result.
172
+ */
173
+
174
+ /* MRG32k3a RNG */
175
+ /* This generator uses the MRG32k3A formula of
176
+ http://www.iro.umontreal.ca/~lecuyer/myftp/streams00/c++/streams4.pdf
177
+ Has period 2^191.
178
+ */
179
+
180
+ /* moduli for the recursions */
181
+ /** \cond UNHIDE_DEFINES */
182
+ #define MRG32K3A_MOD1 4294967087.
183
+ #define MRG32K3A_MOD2 4294944443.
184
+
185
+ /* Constants used in generation */
186
+
187
+ #define MRG32K3A_A12 1403580.
188
+ #define MRG32K3A_A13N 810728.
189
+ #define MRG32K3A_A21 527612.
190
+ #define MRG32K3A_A23N 1370589.
191
+ #define MRG32K3A_NORM (2.3283065498378288e-10)
192
+ //
193
+ // #define MRG32K3A_BITS_NORM ((double)((POW32_DOUBLE-1.0)/MOD1))
194
+ // above constant, used verbatim, rounds differently on some host systems.
195
+ #define MRG32K3A_BITS_NORM 1.000000048662
196
+
197
+ /** \endcond */
198
+
199
+
200
+
201
+
202
+ /**
203
+ * CURAND MRG32K3A state
204
+ */
205
+ struct curandStateMRG32k3a;
206
+
207
+ /* Implementation details not in reference documentation */
208
+ struct curandStateMRG32k3a {
209
+ unsigned int s1[3];
210
+ unsigned int s2[3];
211
+ int boxmuller_flag;
212
+ int boxmuller_flag_double;
213
+ float boxmuller_extra;
214
+ double boxmuller_extra_double;
215
+ };
216
+
217
+ /*
218
+ * CURAND MRG32K3A state
219
+ */
220
+ /** \cond UNHIDE_TYPEDEFS */
221
+ typedef struct curandStateMRG32k3a curandStateMRG32k3a_t;
222
+ /** \endcond */
223
+
224
+ /* SOBOL QRNG */
225
+ /**
226
+ * CURAND Sobol32 state
227
+ */
228
+ struct curandStateSobol32;
229
+
230
+ /* Implementation details not in reference documentation */
231
+ struct curandStateSobol32 {
232
+ unsigned int i, x, c;
233
+ unsigned int direction_vectors[32];
234
+ };
235
+
236
+ /*
237
+ * CURAND Sobol32 state
238
+ */
239
+ /** \cond UNHIDE_TYPEDEFS */
240
+ typedef struct curandStateSobol32 curandStateSobol32_t;
241
+ /** \endcond */
242
+
243
+ /**
244
+ * CURAND Scrambled Sobol32 state
245
+ */
246
+ struct curandStateScrambledSobol32;
247
+
248
+ /* Implementation details not in reference documentation */
249
+ struct curandStateScrambledSobol32 {
250
+ unsigned int i, x, c;
251
+ unsigned int direction_vectors[32];
252
+ };
253
+
254
+ /*
255
+ * CURAND Scrambled Sobol32 state
256
+ */
257
+ /** \cond UNHIDE_TYPEDEFS */
258
+ typedef struct curandStateScrambledSobol32 curandStateScrambledSobol32_t;
259
+ /** \endcond */
260
+
261
+ /**
262
+ * CURAND Sobol64 state
263
+ */
264
+ struct curandStateSobol64;
265
+
266
+ /* Implementation details not in reference documentation */
267
+ struct curandStateSobol64 {
268
+ unsigned long long i, x, c;
269
+ unsigned long long direction_vectors[64];
270
+ };
271
+
272
+ /*
273
+ * CURAND Sobol64 state
274
+ */
275
+ /** \cond UNHIDE_TYPEDEFS */
276
+ typedef struct curandStateSobol64 curandStateSobol64_t;
277
+ /** \endcond */
278
+
279
+ /**
280
+ * CURAND Scrambled Sobol64 state
281
+ */
282
+ struct curandStateScrambledSobol64;
283
+
284
+ /* Implementation details not in reference documentation */
285
+ struct curandStateScrambledSobol64 {
286
+ unsigned long long i, x, c;
287
+ unsigned long long direction_vectors[64];
288
+ };
289
+
290
+ /*
291
+ * CURAND Scrambled Sobol64 state
292
+ */
293
+ /** \cond UNHIDE_TYPEDEFS */
294
+ typedef struct curandStateScrambledSobol64 curandStateScrambledSobol64_t;
295
+ /** \endcond */
296
+
297
+ /*
298
+ * Default RNG
299
+ */
300
+ /** \cond UNHIDE_TYPEDEFS */
301
+ typedef struct curandStateXORWOW curandState_t;
302
+ typedef struct curandStateXORWOW curandState;
303
+ /** \endcond */
304
+
305
+ /****************************************************************************/
306
+ /* Utility functions needed by RNGs */
307
+ /****************************************************************************/
308
+ /** \cond UNHIDE_UTILITIES */
309
+ /*
310
+ multiply vector by matrix, store in result
311
+ matrix is n x n, measured in 32 bit units
312
+ matrix is stored in row major order
313
+ vector and result cannot be same pointer
314
+ */
315
+ template<int N>
316
+ QUALIFIERS void __curand_matvec_inplace(unsigned int *vector, unsigned int *matrix)
317
+ {
318
+ unsigned int result[N] = { 0 };
319
+ for(int i = 0; i < N; i++) {
320
+ #ifdef __CUDA_ARCH__
321
+ #pragma unroll 16
322
+ #endif
323
+ for(int j = 0; j < 32; j++) {
324
+ if(vector[i] & (1 << j)) {
325
+ for(int k = 0; k < N; k++) {
326
+ result[k] ^= matrix[N * (i * 32 + j) + k];
327
+ }
328
+ }
329
+ }
330
+ }
331
+ for(int i = 0; i < N; i++) {
332
+ vector[i] = result[i];
333
+ }
334
+ }
335
+
336
+ QUALIFIERS void __curand_matvec(unsigned int *vector, unsigned int *matrix,
337
+ unsigned int *result, int n)
338
+ {
339
+ for(int i = 0; i < n; i++) {
340
+ result[i] = 0;
341
+ }
342
+ for(int i = 0; i < n; i++) {
343
+ for(int j = 0; j < 32; j++) {
344
+ if(vector[i] & (1 << j)) {
345
+ for(int k = 0; k < n; k++) {
346
+ result[k] ^= matrix[n * (i * 32 + j) + k];
347
+ }
348
+ }
349
+ }
350
+ }
351
+ }
352
+
353
+ /* generate identity matrix */
354
+ QUALIFIERS void __curand_matidentity(unsigned int *matrix, int n)
355
+ {
356
+ int r;
357
+ for(int i = 0; i < n * 32; i++) {
358
+ for(int j = 0; j < n; j++) {
359
+ r = i & 31;
360
+ if(i / 32 == j) {
361
+ matrix[i * n + j] = (1 << r);
362
+ } else {
363
+ matrix[i * n + j] = 0;
364
+ }
365
+ }
366
+ }
367
+ }
368
+
369
+ /* multiply matrixA by matrixB, store back in matrixA
370
+ matrixA and matrixB must not be same matrix */
371
+ QUALIFIERS void __curand_matmat(unsigned int *matrixA, unsigned int *matrixB, int n)
372
+ {
373
+ unsigned int result[MAX_XOR_N];
374
+ for(int i = 0; i < n * 32; i++) {
375
+ __curand_matvec(matrixA + i * n, matrixB, result, n);
376
+ for(int j = 0; j < n; j++) {
377
+ matrixA[i * n + j] = result[j];
378
+ }
379
+ }
380
+ }
381
+
382
+ /* copy vectorA to vector */
383
+ QUALIFIERS void __curand_veccopy(unsigned int *vector, unsigned int *vectorA, int n)
384
+ {
385
+ for(int i = 0; i < n; i++) {
386
+ vector[i] = vectorA[i];
387
+ }
388
+ }
389
+
390
+ /* copy matrixA to matrix */
391
+ QUALIFIERS void __curand_matcopy(unsigned int *matrix, unsigned int *matrixA, int n)
392
+ {
393
+ for(int i = 0; i < n * n * 32; i++) {
394
+ matrix[i] = matrixA[i];
395
+ }
396
+ }
397
+
398
+ /* compute matrixA to power p, store result in matrix */
399
+ QUALIFIERS void __curand_matpow(unsigned int *matrix, unsigned int *matrixA,
400
+ unsigned long long p, int n)
401
+ {
402
+ unsigned int matrixR[MAX_XOR_N * MAX_XOR_N * 32];
403
+ unsigned int matrixS[MAX_XOR_N * MAX_XOR_N * 32];
404
+ __curand_matidentity(matrix, n);
405
+ __curand_matcopy(matrixR, matrixA, n);
406
+ while(p) {
407
+ if(p & 1) {
408
+ __curand_matmat(matrix, matrixR, n);
409
+ }
410
+ __curand_matcopy(matrixS, matrixR, n);
411
+ __curand_matmat(matrixR, matrixS, n);
412
+ p >>= 1;
413
+ }
414
+ }
415
+
416
+ /****************************************************************************/
417
+ /* Utility functions needed by MRG32k3a RNG */
418
+ /* Matrix operations modulo some integer less than 2**32, done in */
419
+ /* double precision floating point, with care not to overflow 53 bits */
420
+ /****************************************************************************/
421
+
422
+ /* return i mod m. */
423
+ /* assumes i and m are integers represented accurately in doubles */
424
+
425
+ QUALIFIERS double curand_MRGmod(double i, double m)
426
+ {
427
+ double quo;
428
+ double rem;
429
+ quo = floor(i/m);
430
+ rem = i - (quo*m);
431
+ if (rem < 0.0) rem += m;
432
+ return rem;
433
+ }
434
+
435
+ /* Multiplication modulo m. Inputs i and j less than 2**32 */
436
+ /* Ensure intermediate results do not exceed 2**53 */
437
+
438
+ QUALIFIERS double curand_MRGmodMul(double i, double j, double m)
439
+ {
440
+ double tempHi;
441
+ double tempLo;
442
+
443
+ tempHi = floor(i/131072.0);
444
+ tempLo = i - (tempHi*131072.0);
445
+ tempLo = curand_MRGmod( curand_MRGmod( (tempHi * j), m) * 131072.0 + curand_MRGmod(tempLo * j, m),m);
446
+
447
+ if (tempLo < 0.0) tempLo += m;
448
+ return tempLo;
449
+ }
450
+
451
+ /* multiply 3 by 3 matrices of doubles, modulo m */
452
+
453
+ QUALIFIERS void curand_MRGmatMul3x3(unsigned int i1[][3],unsigned int i2[][3],unsigned int o[][3],double m)
454
+ {
455
+ int i,j;
456
+ double temp[3][3];
457
+ for (i=0; i<3; i++){
458
+ for (j=0; j<3; j++){
459
+ temp[i][j] = ( curand_MRGmodMul(i1[i][0], i2[0][j], m) +
460
+ curand_MRGmodMul(i1[i][1], i2[1][j], m) +
461
+ curand_MRGmodMul(i1[i][2], i2[2][j], m));
462
+ temp[i][j] = curand_MRGmod( temp[i][j], m );
463
+ }
464
+ }
465
+ for (i=0; i<3; i++){
466
+ for (j=0; j<3; j++){
467
+ o[i][j] = (unsigned int)temp[i][j];
468
+ }
469
+ }
470
+ }
471
+
472
+ /* multiply 3 by 3 matrix times 3 by 1 vector of doubles, modulo m */
473
+
474
+ QUALIFIERS void curand_MRGmatVecMul3x3( unsigned int i[][3], unsigned int v[], double m)
475
+ {
476
+ int k;
477
+ double t[3];
478
+ for (k = 0; k < 3; k++) {
479
+ t[k] = ( curand_MRGmodMul(i[k][0], v[0], m) +
480
+ curand_MRGmodMul(i[k][1], v[1], m) +
481
+ curand_MRGmodMul(i[k][2], v[2], m) );
482
+ t[k] = curand_MRGmod( t[k], m );
483
+ }
484
+ for (k = 0; k < 3; k++) {
485
+ v[k] = (unsigned int)t[k];
486
+ }
487
+
488
+ }
489
+
490
+ /* raise a 3 by 3 matrix of doubles to a 64 bit integer power pow, modulo m */
491
+ /* input is index zero of an array of 3 by 3 matrices m, */
492
+ /* each m = m[0]**(2**index) */
493
+
494
+ QUALIFIERS void curand_MRGmatPow3x3( unsigned int in[][3][3], unsigned int o[][3], double m, unsigned long long pow )
495
+ {
496
+ int i,j;
497
+ for ( i = 0; i < 3; i++ ) {
498
+ for ( j = 0; j < 3; j++ ) {
499
+ o[i][j] = 0;
500
+ if ( i == j ) o[i][j] = 1;
501
+ }
502
+ }
503
+ i = 0;
504
+ curand_MRGmatVecMul3x3(o,o[0],m);
505
+ while (pow) {
506
+ if ( pow & 1ll ) {
507
+ curand_MRGmatMul3x3(in[i], o, o, m);
508
+ }
509
+ i++;
510
+ pow >>= 1;
511
+ }
512
+ }
513
+
514
+ /* raise a 3 by 3 matrix of doubles to the power */
515
+ /* 2 to the power (pow modulo 191), modulo m */
516
+
517
+ QUALIFIERS void curnand_MRGmatPow2Pow3x3( double in[][3], double o[][3], double m, unsigned long pow )
518
+ {
519
+ unsigned int temp[3][3];
520
+ int i,j;
521
+ pow = pow % 191;
522
+ for ( i = 0; i < 3; i++ ) {
523
+ for ( j = 0; j < 3; j++ ) {
524
+ temp[i][j] = (unsigned int)in[i][j];
525
+ }
526
+ }
527
+ while (pow) {
528
+ curand_MRGmatMul3x3(temp, temp, temp, m);
529
+ pow--;
530
+ }
531
+ for ( i = 0; i < 3; i++ ) {
532
+ for ( j = 0; j < 3; j++ ) {
533
+ o[i][j] = temp[i][j];
534
+ }
535
+ }
536
+ }
537
+
538
+ /** \endcond */
539
+
540
+ /****************************************************************************/
541
+ /* Kernel implementations of RNGs */
542
+ /****************************************************************************/
543
+
544
+ /* Test RNG */
545
+
546
+ QUALIFIERS void curand_init(unsigned long long seed,
547
+ unsigned long long subsequence,
548
+ unsigned long long offset,
549
+ curandStateTest_t *state)
550
+ {
551
+ state->v = (unsigned int)(seed * 3) + (unsigned int)(subsequence * 31337) + \
552
+ (unsigned int)offset;
553
+ }
554
+
555
+
556
+ QUALIFIERS unsigned int curand(curandStateTest_t *state)
557
+ {
558
+ unsigned int r = state->v++;
559
+ return r;
560
+ }
561
+
562
+ QUALIFIERS void skipahead(unsigned long long n, curandStateTest_t *state)
563
+ {
564
+ state->v += (unsigned int)n;
565
+ }
566
+
567
+ /* XORWOW RNG */
568
+
569
+ template <typename T, int n>
570
+ QUALIFIERS void __curand_generate_skipahead_matrix_xor(unsigned int matrix[])
571
+ {
572
+ T state;
573
+ // Generate matrix that advances one step
574
+ // matrix has n * n * 32 32-bit elements
575
+ // solve for matrix by stepping single bit states
576
+ for(int i = 0; i < 32 * n; i++) {
577
+ state.d = 0;
578
+ for(int j = 0; j < n; j++) {
579
+ state.v[j] = 0;
580
+ }
581
+ state.v[i / 32] = (1 << (i & 31));
582
+ curand(&state);
583
+ for(int j = 0; j < n; j++) {
584
+ matrix[i * n + j] = state.v[j];
585
+ }
586
+ }
587
+ }
588
+
589
+ template <typename T, int n>
590
+ QUALIFIERS void _skipahead_scratch(unsigned long long x, T *state, unsigned int *scratch)
591
+ {
592
+ // unsigned int matrix[n * n * 32];
593
+ unsigned int *matrix = scratch;
594
+ // unsigned int matrixA[n * n * 32];
595
+ unsigned int *matrixA = scratch + (n * n * 32);
596
+ // unsigned int vector[n];
597
+ unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
598
+ // unsigned int result[n];
599
+ unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
600
+ unsigned long long p = x;
601
+ for(int i = 0; i < n; i++) {
602
+ vector[i] = state->v[i];
603
+ }
604
+ int matrix_num = 0;
605
+ while(p && (matrix_num < PRECALC_NUM_MATRICES - 1)) {
606
+ for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
607
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
608
+ __curand_matvec(vector, precalc_xorwow_offset_matrix[matrix_num], result, n);
609
+ ,
610
+ __curand_matvec(vector, precalc_xorwow_offset_matrix_host[matrix_num], result, n);
611
+ )
612
+ __curand_veccopy(vector, result, n);
613
+ }
614
+ p >>= PRECALC_BLOCK_SIZE;
615
+ matrix_num++;
616
+ }
617
+ if(p) {
618
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
619
+ __curand_matcopy(matrix, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
620
+ __curand_matcopy(matrixA, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
621
+ ,
622
+ __curand_matcopy(matrix, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
623
+ __curand_matcopy(matrixA, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
624
+ )
625
+ }
626
+ while(p) {
627
+ for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
628
+ __curand_matvec(vector, matrixA, result, n);
629
+ __curand_veccopy(vector, result, n);
630
+ }
631
+ p >>= SKIPAHEAD_BLOCKSIZE;
632
+ if(p) {
633
+ for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
634
+ __curand_matmat(matrix, matrixA, n);
635
+ __curand_matcopy(matrixA, matrix, n);
636
+ }
637
+ }
638
+ }
639
+ for(int i = 0; i < n; i++) {
640
+ state->v[i] = vector[i];
641
+ }
642
+ state->d += 362437 * (unsigned int)x;
643
+ }
644
+
645
+ template <typename T, int n>
646
+ QUALIFIERS void _skipahead_sequence_scratch(unsigned long long x, T *state, unsigned int *scratch)
647
+ {
648
+ // unsigned int matrix[n * n * 32];
649
+ unsigned int *matrix = scratch;
650
+ // unsigned int matrixA[n * n * 32];
651
+ unsigned int *matrixA = scratch + (n * n * 32);
652
+ // unsigned int vector[n];
653
+ unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
654
+ // unsigned int result[n];
655
+ unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
656
+ unsigned long long p = x;
657
+ for(int i = 0; i < n; i++) {
658
+ vector[i] = state->v[i];
659
+ }
660
+ int matrix_num = 0;
661
+ while(p && matrix_num < PRECALC_NUM_MATRICES - 1) {
662
+ for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
663
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
664
+ __curand_matvec(vector, precalc_xorwow_matrix[matrix_num], result, n);
665
+ ,
666
+ __curand_matvec(vector, precalc_xorwow_matrix_host[matrix_num], result, n);
667
+ )
668
+ __curand_veccopy(vector, result, n);
669
+ }
670
+ p >>= PRECALC_BLOCK_SIZE;
671
+ matrix_num++;
672
+ }
673
+ if(p) {
674
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
675
+ __curand_matcopy(matrix, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
676
+ __curand_matcopy(matrixA, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
677
+ ,
678
+ __curand_matcopy(matrix, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
679
+ __curand_matcopy(matrixA, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
680
+ )
681
+ }
682
+ while(p) {
683
+ for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
684
+ __curand_matvec(vector, matrixA, result, n);
685
+ __curand_veccopy(vector, result, n);
686
+ }
687
+ p >>= SKIPAHEAD_BLOCKSIZE;
688
+ if(p) {
689
+ for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
690
+ __curand_matmat(matrix, matrixA, n);
691
+ __curand_matcopy(matrixA, matrix, n);
692
+ }
693
+ }
694
+ }
695
+ for(int i = 0; i < n; i++) {
696
+ state->v[i] = vector[i];
697
+ }
698
+ /* No update of state->d needed, guaranteed to be a multiple of 2^32 */
699
+ }
700
+
701
+ template <typename T, int N>
702
+ QUALIFIERS void _skipahead_inplace(const unsigned long long x, T *state)
703
+ {
704
+ unsigned long long p = x;
705
+ int matrix_num = 0;
706
+ while(p) {
707
+ for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
708
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
709
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_offset_matrix[matrix_num]);
710
+ ,
711
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_offset_matrix_host[matrix_num]);
712
+ )
713
+ }
714
+ p >>= PRECALC_BLOCK_SIZE;
715
+ matrix_num++;
716
+ }
717
+ state->d += 362437 * (unsigned int)x;
718
+ }
719
+
720
+ template <typename T, int N>
721
+ QUALIFIERS void _skipahead_sequence_inplace(unsigned long long x, T *state)
722
+ {
723
+ int matrix_num = 0;
724
+ while(x) {
725
+ for(unsigned int t = 0; t < (x & PRECALC_BLOCK_MASK); t++) {
726
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
727
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_matrix[matrix_num]);
728
+ ,
729
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_matrix_host[matrix_num]);
730
+ )
731
+ }
732
+ x >>= PRECALC_BLOCK_SIZE;
733
+ matrix_num++;
734
+ }
735
+ /* No update of state->d needed, guaranteed to be a multiple of 2^32 */
736
+ }
737
+
738
+ /**
739
+ * \brief Update XORWOW state to skip \p n elements.
740
+ *
741
+ * Update the XORWOW state in \p state to skip ahead \p n elements.
742
+ *
743
+ * All values of \p n are valid. Large values require more computation and so
744
+ * will take more time to complete.
745
+ *
746
+ * \param n - Number of elements to skip
747
+ * \param state - Pointer to state to update
748
+ */
749
+ QUALIFIERS void skipahead(unsigned long long n, curandStateXORWOW_t *state)
750
+ {
751
+ _skipahead_inplace<curandStateXORWOW_t, 5>(n, state);
752
+ }
753
+
754
+ /**
755
+ * \brief Update XORWOW state to skip ahead \p n subsequences.
756
+ *
757
+ * Update the XORWOW state in \p state to skip ahead \p n subsequences. Each
758
+ * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
759
+ * \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly * n elements.
760
+ *
761
+ * All values of \p n are valid. Large values require more computation and so
762
+ * will take more time to complete.
763
+ *
764
+ * \param n - Number of subsequences to skip
765
+ * \param state - Pointer to state to update
766
+ */
767
+ QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateXORWOW_t *state)
768
+ {
769
+ _skipahead_sequence_inplace<curandStateXORWOW_t, 5>(n, state);
770
+ }
771
+
772
+ QUALIFIERS void _curand_init_scratch(unsigned long long seed,
773
+ unsigned long long subsequence,
774
+ unsigned long long offset,
775
+ curandStateXORWOW_t *state,
776
+ unsigned int *scratch)
777
+ {
778
+ // Break up seed, apply salt
779
+ // Constants are arbitrary nonzero values
780
+ unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
781
+ unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
782
+ // Simple multiplication to mix up bits
783
+ // Constants are arbitrary odd values
784
+ unsigned int t0 = 1099087573UL * s0;
785
+ unsigned int t1 = 2591861531UL * s1;
786
+ state->d = 6615241 + t1 + t0;
787
+ state->v[0] = 123456789UL + t0;
788
+ state->v[1] = 362436069UL ^ t0;
789
+ state->v[2] = 521288629UL + t1;
790
+ state->v[3] = 88675123UL ^ t1;
791
+ state->v[4] = 5783321UL + t0;
792
+ _skipahead_sequence_scratch<curandStateXORWOW_t, 5>(subsequence, state, scratch);
793
+ _skipahead_scratch<curandStateXORWOW_t, 5>(offset, state, scratch);
794
+ state->boxmuller_flag = 0;
795
+ state->boxmuller_flag_double = 0;
796
+ state->boxmuller_extra = 0.f;
797
+ state->boxmuller_extra_double = 0.;
798
+ }
799
+
800
+ QUALIFIERS void _curand_init_inplace(unsigned long long seed,
801
+ unsigned long long subsequence,
802
+ unsigned long long offset,
803
+ curandStateXORWOW_t *state)
804
+ {
805
+ // Break up seed, apply salt
806
+ // Constants are arbitrary nonzero values
807
+ unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
808
+ unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
809
+ // Simple multiplication to mix up bits
810
+ // Constants are arbitrary odd values
811
+ unsigned int t0 = 1099087573UL * s0;
812
+ unsigned int t1 = 2591861531UL * s1;
813
+ state->d = 6615241 + t1 + t0;
814
+ state->v[0] = 123456789UL + t0;
815
+ state->v[1] = 362436069UL ^ t0;
816
+ state->v[2] = 521288629UL + t1;
817
+ state->v[3] = 88675123UL ^ t1;
818
+ state->v[4] = 5783321UL + t0;
819
+ _skipahead_sequence_inplace<curandStateXORWOW_t, 5>(subsequence, state);
820
+ _skipahead_inplace<curandStateXORWOW_t, 5>(offset, state);
821
+ state->boxmuller_flag = 0;
822
+ state->boxmuller_flag_double = 0;
823
+ state->boxmuller_extra = 0.f;
824
+ state->boxmuller_extra_double = 0.;
825
+ }
826
+
827
+ /**
828
+ * \brief Initialize XORWOW state.
829
+ *
830
+ * Initialize XORWOW state in \p state with the given \p seed, \p subsequence,
831
+ * and \p offset.
832
+ *
833
+ * All input values of \p seed, \p subsequence, and \p offset are legal. Large
834
+ * values for \p subsequence and \p offset require more computation and so will
835
+ * take more time to complete.
836
+ *
837
+ * A value of 0 for \p seed sets the state to the values of the original
838
+ * published version of the \p xorwow algorithm.
839
+ *
840
+ * \param seed - Arbitrary bits to use as a seed
841
+ * \param subsequence - Subsequence to start at
842
+ * \param offset - Absolute offset into sequence
843
+ * \param state - Pointer to state to initialize
844
+ */
845
+ QUALIFIERS void curand_init(unsigned long long seed,
846
+ unsigned long long subsequence,
847
+ unsigned long long offset,
848
+ curandStateXORWOW_t *state)
849
+ {
850
+ _curand_init_inplace(seed, subsequence, offset, state);
851
+ }
852
+
853
+ /**
854
+ * \brief Return 32-bits of pseudorandomness from an XORWOW generator.
855
+ *
856
+ * Return 32-bits of pseudorandomness from the XORWOW generator in \p state,
857
+ * increment position of generator by one.
858
+ *
859
+ * \param state - Pointer to state to update
860
+ *
861
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
862
+ */
863
+ QUALIFIERS unsigned int curand(curandStateXORWOW_t *state)
864
+ {
865
+ unsigned int t;
866
+ t = (state->v[0] ^ (state->v[0] >> 2));
867
+ state->v[0] = state->v[1];
868
+ state->v[1] = state->v[2];
869
+ state->v[2] = state->v[3];
870
+ state->v[3] = state->v[4];
871
+ state->v[4] = (state->v[4] ^ (state->v[4] <<4)) ^ (t ^ (t << 1));
872
+ state->d += 362437;
873
+ return state->v[4] + state->d;
874
+ }
875
+
876
+
877
+ /**
878
+ * \brief Return 32-bits of pseudorandomness from an Philox4_32_10 generator.
879
+ *
880
+ * Return 32-bits of pseudorandomness from the Philox4_32_10 generator in \p state,
881
+ * increment position of generator by one.
882
+ *
883
+ * \param state - Pointer to state to update
884
+ *
885
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
886
+ */
887
+
888
+ QUALIFIERS unsigned int curand(curandStatePhilox4_32_10_t *state)
889
+ {
890
+ // Maintain the invariant: output[STATE] is always "good" and
891
+ // is the next value to be returned by curand.
892
+ unsigned int ret;
893
+ switch(state->STATE++){
894
+ default:
895
+ ret = state->output.x;
896
+ break;
897
+ case 1:
898
+ ret = state->output.y;
899
+ break;
900
+ case 2:
901
+ ret = state->output.z;
902
+ break;
903
+ case 3:
904
+ ret = state->output.w;
905
+ break;
906
+ }
907
+ if(state->STATE == 4){
908
+ Philox_State_Incr(state);
909
+ state->output = curand_Philox4x32_10(state->ctr,state->key);
910
+ state->STATE = 0;
911
+ }
912
+ return ret;
913
+ }
914
+
915
+ /**
916
+ * \brief Return tuple of 4 32-bit pseudorandoms from a Philox4_32_10 generator.
917
+ *
918
+ * Return 128 bits of pseudorandomness from the Philox4_32_10 generator in \p state,
919
+ * increment position of generator by four.
920
+ *
921
+ * \param state - Pointer to state to update
922
+ *
923
+ * \return 128-bits of pseudorandomness as a uint4, all bits valid to use.
924
+ */
925
+
926
+ QUALIFIERS uint4 curand4(curandStatePhilox4_32_10_t *state)
927
+ {
928
+ uint4 r;
929
+
930
+ uint4 tmp = state->output;
931
+ Philox_State_Incr(state);
932
+ state->output= curand_Philox4x32_10(state->ctr,state->key);
933
+ switch(state->STATE){
934
+ case 0:
935
+ return tmp;
936
+ case 1:
937
+ r.x = tmp.y;
938
+ r.y = tmp.z;
939
+ r.z = tmp.w;
940
+ r.w = state->output.x;
941
+ break;
942
+ case 2:
943
+ r.x = tmp.z;
944
+ r.y = tmp.w;
945
+ r.z = state->output.x;
946
+ r.w = state->output.y;
947
+ break;
948
+ case 3:
949
+ r.x = tmp.w;
950
+ r.y = state->output.x;
951
+ r.z = state->output.y;
952
+ r.w = state->output.z;
953
+ break;
954
+ default:
955
+ // NOT possible but needed to avoid compiler warnings
956
+ return tmp;
957
+ }
958
+ return r;
959
+ }
960
+
961
+ /**
962
+ * \brief Update Philox4_32_10 state to skip \p n elements.
963
+ *
964
+ * Update the Philox4_32_10 state in \p state to skip ahead \p n elements.
965
+ *
966
+ * All values of \p n are valid.
967
+ *
968
+ * \param n - Number of elements to skip
969
+ * \param state - Pointer to state to update
970
+ */
971
+ QUALIFIERS void skipahead(unsigned long long n, curandStatePhilox4_32_10_t *state)
972
+ {
973
+ state->STATE += (n & 3);
974
+ n /= 4;
975
+ if( state->STATE > 3 ){
976
+ n += 1;
977
+ state->STATE -= 4;
978
+ }
979
+ Philox_State_Incr(state, n);
980
+ state->output = curand_Philox4x32_10(state->ctr,state->key);
981
+ }
982
+
983
+ /**
984
+ * \brief Update Philox4_32_10 state to skip ahead \p n subsequences.
985
+ *
986
+ * Update the Philox4_32_10 state in \p state to skip ahead \p n subsequences. Each
987
+ * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
988
+ * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly * n elements.
989
+ *
990
+ * All values of \p n are valid.
991
+ *
992
+ * \param n - Number of subsequences to skip
993
+ * \param state - Pointer to state to update
994
+ */
995
+ QUALIFIERS void skipahead_sequence(unsigned long long n, curandStatePhilox4_32_10_t *state)
996
+ {
997
+ Philox_State_Incr_hi(state, n);
998
+ state->output = curand_Philox4x32_10(state->ctr,state->key);
999
+ }
1000
+
1001
+ /**
1002
+ * \brief Initialize Philox4_32_10 state.
1003
+ *
1004
+ * Initialize Philox4_32_10 state in \p state with the given \p seed, p\ subsequence,
1005
+ * and \p offset.
1006
+ *
1007
+ * All input values for \p seed, \p subseqence and \p offset are legal. Each of the
1008
+ * \xmlonly<ph outputclass="xmlonly">2<sup>64</sup></ph>\endxmlonly possible
1009
+ * values of seed selects an independent sequence of length
1010
+ * \xmlonly<ph outputclass="xmlonly">2<sup>130</sup></ph>\endxmlonly.
1011
+ * The first
1012
+ * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup> * subsequence + offset</ph>\endxmlonly.
1013
+ * values of the sequence are skipped.
1014
+ * I.e., subsequences are of length
1015
+ * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly.
1016
+ *
1017
+ * \param seed - Arbitrary bits to use as a seed
1018
+ * \param subsequence - Subsequence to start at
1019
+ * \param offset - Absolute offset into subsequence
1020
+ * \param state - Pointer to state to initialize
1021
+ */
1022
+ QUALIFIERS void curand_init(unsigned long long seed,
1023
+ unsigned long long subsequence,
1024
+ unsigned long long offset,
1025
+ curandStatePhilox4_32_10_t *state)
1026
+ {
1027
+ state->ctr = make_uint4(0, 0, 0, 0);
1028
+ state->key.x = (unsigned int)seed;
1029
+ state->key.y = (unsigned int)(seed>>32);
1030
+ state->STATE = 0;
1031
+ state->boxmuller_flag = 0;
1032
+ state->boxmuller_flag_double = 0;
1033
+ state->boxmuller_extra = 0.f;
1034
+ state->boxmuller_extra_double = 0.;
1035
+ skipahead_sequence(subsequence, state);
1036
+ skipahead(offset, state);
1037
+ }
1038
+
1039
+
1040
+ /* MRG32k3a RNG */
1041
+
1042
+ /* Base generator for MRG32k3a */
1043
+ QUALIFIERS unsigned long long __curand_umad(GCC_UNUSED_PARAMETER unsigned int a, GCC_UNUSED_PARAMETER unsigned int b, GCC_UNUSED_PARAMETER unsigned long long c)
1044
+ {
1045
+ unsigned long long r = 0;
1046
+ NV_IF_TARGET(NV_PROVIDES_SM_61,
1047
+ asm("mad.wide.u32 %0, %1, %2, %3;"
1048
+ : "=l"(r) : "r"(a), "r"(b), "l"(c));
1049
+ )
1050
+ return r;
1051
+ }
1052
+ QUALIFIERS unsigned long long __curand_umul(GCC_UNUSED_PARAMETER unsigned int a, GCC_UNUSED_PARAMETER unsigned int b)
1053
+ {
1054
+ unsigned long long r = 0;
1055
+ NV_IF_TARGET(NV_PROVIDES_SM_61,
1056
+ asm("mul.wide.u32 %0, %1, %2;"
1057
+ : "=l"(r) : "r"(a), "r"(b));
1058
+ )
1059
+ return r;
1060
+ }
1061
+ QUALIFIERS double curand_MRG32k3a (curandStateMRG32k3a_t *state)
1062
+ {
1063
+ NV_IF_TARGET(NV_PROVIDES_SM_61,
1064
+ const unsigned int m1 = 4294967087u;
1065
+ const unsigned int m2 = 4294944443u;
1066
+ const unsigned int m1c = 209u;
1067
+ const unsigned int m2c = 22853u;
1068
+ const unsigned int a12 = 1403580u;
1069
+ const unsigned int a13n = 810728u;
1070
+ const unsigned int a21 = 527612u;
1071
+ const unsigned int a23n = 1370589u;
1072
+
1073
+ unsigned long long p1;
1074
+ unsigned long long p2;
1075
+ const unsigned long long p3 = __curand_umul(a13n, m1 - state->s1[0]);
1076
+ p1 = __curand_umad(a12, state->s1[1], p3);
1077
+
1078
+ // Putting addition inside and changing umul to umad
1079
+ // slowed this function down on GV100
1080
+ p1 = __curand_umul(p1 >> 32, m1c) + (p1 & 0xffffffff);
1081
+ if (p1 >= m1) p1 -= m1;
1082
+
1083
+ state->s1[0] = state->s1[1]; state->s1[1] = state->s1[2]; state->s1[2] = p1;
1084
+ const unsigned long long p4 = __curand_umul(a23n, m2 - state->s2[0]);
1085
+ p2 = __curand_umad(a21, state->s2[2], p4);
1086
+
1087
+ // Putting addition inside and changing umul to umad
1088
+ // slowed this function down on GV100
1089
+ p2 = __curand_umul(p2 >> 32, m2c) + (p2 & 0xffffffff);
1090
+ p2 = __curand_umul(p2 >> 32, m2c) + (p2 & 0xffffffff);
1091
+ if (p2 >= m2) p2 -= m2;
1092
+
1093
+ state->s2[0] = state->s2[1]; state->s2[1] = state->s2[2]; state->s2[2] = p2;
1094
+
1095
+ const unsigned int p5 = (unsigned int)p1 - (unsigned int)p2;
1096
+ if(p1 <= p2) return p5 + m1;
1097
+ return p5;
1098
+ )
1099
+ NV_IF_TARGET(NV_IS_DEVICE,
1100
+ /* nj's implementation */
1101
+ const double m1 = 4294967087.;
1102
+ const double m2 = 4294944443.;
1103
+ const double a12 = 1403580.;
1104
+ const double a13n = 810728.;
1105
+ const double a21 = 527612.;
1106
+ const double a23n = 1370589.;
1107
+
1108
+ const double rh1 = 2.3283065498378290e-010; /* (1.0 / m1)__hi */
1109
+ const double rl1 = -1.7354913086174288e-026; /* (1.0 / m1)__lo */
1110
+ const double rh2 = 2.3283188252407387e-010; /* (1.0 / m2)__hi */
1111
+ const double rl2 = 2.4081018096503646e-026; /* (1.0 / m2)__lo */
1112
+
1113
+ double q;
1114
+ double p1;
1115
+ double p2;
1116
+ p1 = a12 * state->s1[1] - a13n * state->s1[0];
1117
+ q = trunc (fma (p1, rh1, p1 * rl1));
1118
+ p1 -= q * m1;
1119
+ if (p1 < 0.0) p1 += m1;
1120
+ state->s1[0] = state->s1[1]; state->s1[1] = state->s1[2]; state->s1[2] = (unsigned int)p1;
1121
+ p2 = a21 * state->s2[2] - a23n * state->s2[0];
1122
+ q = trunc (fma (p2, rh2, p2 * rl2));
1123
+ p2 -= q * m2;
1124
+ if (p2 < 0.0) p2 += m2;
1125
+ state->s2[0] = state->s2[1]; state->s2[1] = state->s2[2]; state->s2[2] = (unsigned int)p2;
1126
+ if (p1 <= p2) return (p1 - p2 + m1);
1127
+ else return (p1 - p2);
1128
+ )
1129
+ /* end nj's implementation */
1130
+ double p1;
1131
+ double p2;
1132
+ double r;
1133
+ p1 = (MRG32K3A_A12 * state->s1[1]) - (MRG32K3A_A13N * state->s1[0]);
1134
+ p1 = curand_MRGmod(p1, MRG32K3A_MOD1);
1135
+ if (p1 < 0.0) p1 += MRG32K3A_MOD1;
1136
+ state->s1[0] = state->s1[1];
1137
+ state->s1[1] = state->s1[2];
1138
+ state->s1[2] = (unsigned int)p1;
1139
+ p2 = (MRG32K3A_A21 * state->s2[2]) - (MRG32K3A_A23N * state->s2[0]);
1140
+ p2 = curand_MRGmod(p2, MRG32K3A_MOD2);
1141
+ if (p2 < 0) p2 += MRG32K3A_MOD2;
1142
+ state->s2[0] = state->s2[1];
1143
+ state->s2[1] = state->s2[2];
1144
+ state->s2[2] = (unsigned int)p2;
1145
+ r = p1 - p2;
1146
+ if (r <= 0) r += MRG32K3A_MOD1;
1147
+ return r;
1148
+ }
1149
+
1150
+
1151
+ /**
1152
+ * \brief Return 32-bits of pseudorandomness from an MRG32k3a generator.
1153
+ *
1154
+ * Return 32-bits of pseudorandomness from the MRG32k3a generator in \p state,
1155
+ * increment position of generator by one.
1156
+ *
1157
+ * \param state - Pointer to state to update
1158
+ *
1159
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
1160
+ */
1161
+ QUALIFIERS unsigned int curand(curandStateMRG32k3a_t *state)
1162
+ {
1163
+ double dRet;
1164
+ dRet = (double)curand_MRG32k3a(state)*(double)MRG32K3A_BITS_NORM;
1165
+ return (unsigned int)dRet;
1166
+ }
1167
+
1168
+
1169
+
1170
+ /**
1171
+ * \brief Update MRG32k3a state to skip \p n elements.
1172
+ *
1173
+ * Update the MRG32k3a state in \p state to skip ahead \p n elements.
1174
+ *
1175
+ * All values of \p n are valid. Large values require more computation and so
1176
+ * will take more time to complete.
1177
+ *
1178
+ * \param n - Number of elements to skip
1179
+ * \param state - Pointer to state to update
1180
+ */
1181
+ QUALIFIERS void skipahead(unsigned long long n, curandStateMRG32k3a_t *state)
1182
+ {
1183
+ unsigned int t[3][3];
1184
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
1185
+ curand_MRGmatPow3x3( mrg32k3aM1, t, MRG32K3A_MOD1, n);
1186
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1187
+ curand_MRGmatPow3x3(mrg32k3aM2, t, MRG32K3A_MOD2, n);
1188
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1189
+ ,
1190
+ curand_MRGmatPow3x3( mrg32k3aM1Host, t, MRG32K3A_MOD1, n);
1191
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1192
+ curand_MRGmatPow3x3(mrg32k3aM2Host, t, MRG32K3A_MOD2, n);
1193
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1194
+ )
1195
+ }
1196
+
1197
+ /**
1198
+ * \brief Update MRG32k3a state to skip ahead \p n subsequences.
1199
+ *
1200
+ * Update the MRG32k3a state in \p state to skip ahead \p n subsequences. Each
1201
+ * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly
1202
+ *
1203
+ * \xmlonly<ph outputclass="xmlonly">2<sup>76</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
1204
+ * \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly * n elements.
1205
+ *
1206
+ * Valid values of \p n are 0 to \xmlonly<ph outputclass="xmlonly">2<sup>51</sup></ph>\endxmlonly. Note \p n will be masked to 51 bits
1207
+ *
1208
+ * \param n - Number of subsequences to skip
1209
+ * \param state - Pointer to state to update
1210
+ */
1211
+ QUALIFIERS void skipahead_subsequence(unsigned long long n, curandStateMRG32k3a_t *state)
1212
+ {
1213
+ unsigned int t[3][3];
1214
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
1215
+ curand_MRGmatPow3x3( mrg32k3aM1SubSeq, t, MRG32K3A_MOD1, n);
1216
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1217
+ curand_MRGmatPow3x3( mrg32k3aM2SubSeq, t, MRG32K3A_MOD2, n);
1218
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1219
+ ,
1220
+ curand_MRGmatPow3x3( mrg32k3aM1SubSeqHost, t, MRG32K3A_MOD1, n);
1221
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1222
+ curand_MRGmatPow3x3( mrg32k3aM2SubSeqHost, t, MRG32K3A_MOD2, n);
1223
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1224
+ )
1225
+ }
1226
+
1227
+ /**
1228
+ * \brief Update MRG32k3a state to skip ahead \p n sequences.
1229
+ *
1230
+ * Update the MRG32k3a state in \p state to skip ahead \p n sequences. Each
1231
+ * sequence is \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
1232
+ * \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly * n elements.
1233
+ *
1234
+ * All values of \p n are valid. Large values require more computation and so
1235
+ * will take more time to complete.
1236
+ *
1237
+ * \param n - Number of sequences to skip
1238
+ * \param state - Pointer to state to update
1239
+ */
1240
+ QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateMRG32k3a_t *state)
1241
+ {
1242
+ unsigned int t[3][3];
1243
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
1244
+ curand_MRGmatPow3x3( mrg32k3aM1Seq, t, MRG32K3A_MOD1, n);
1245
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1246
+ curand_MRGmatPow3x3( mrg32k3aM2Seq, t, MRG32K3A_MOD2, n);
1247
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1248
+ ,
1249
+ curand_MRGmatPow3x3( mrg32k3aM1SeqHost, t, MRG32K3A_MOD1, n);
1250
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1251
+ curand_MRGmatPow3x3( mrg32k3aM2SeqHost, t, MRG32K3A_MOD2, n);
1252
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1253
+ )
1254
+ }
1255
+
1256
+
1257
+ /**
1258
+ * \brief Initialize MRG32k3a state.
1259
+ *
1260
+ * Initialize MRG32k3a state in \p state with the given \p seed, \p subsequence,
1261
+ * and \p offset.
1262
+ *
1263
+ * All input values of \p seed, \p subsequence, and \p offset are legal.
1264
+ * \p subsequence will be truncated to 51 bits to avoid running into the next sequence
1265
+ *
1266
+ * A value of 0 for \p seed sets the state to the values of the original
1267
+ * published version of the \p MRG32k3a algorithm.
1268
+ *
1269
+ * \param seed - Arbitrary bits to use as a seed
1270
+ * \param subsequence - Subsequence to start at
1271
+ * \param offset - Absolute offset into sequence
1272
+ * \param state - Pointer to state to initialize
1273
+ */
1274
+ QUALIFIERS void curand_init(unsigned long long seed,
1275
+ unsigned long long subsequence,
1276
+ unsigned long long offset,
1277
+ curandStateMRG32k3a_t *state)
1278
+ {
1279
+ int i;
1280
+ for ( i=0; i<3; i++ ) {
1281
+ state->s1[i] = 12345u;
1282
+ state->s2[i] = 12345u;
1283
+ }
1284
+ if (seed != 0ull) {
1285
+ unsigned int x1 = ((unsigned int)seed) ^ 0x55555555UL;
1286
+ unsigned int x2 = (unsigned int)((seed >> 32) ^ 0xAAAAAAAAUL);
1287
+ state->s1[0] = (unsigned int)curand_MRGmodMul(x1, state->s1[0], MRG32K3A_MOD1);
1288
+ state->s1[1] = (unsigned int)curand_MRGmodMul(x2, state->s1[1], MRG32K3A_MOD1);
1289
+ state->s1[2] = (unsigned int)curand_MRGmodMul(x1, state->s1[2], MRG32K3A_MOD1);
1290
+ state->s2[0] = (unsigned int)curand_MRGmodMul(x2, state->s2[0], MRG32K3A_MOD2);
1291
+ state->s2[1] = (unsigned int)curand_MRGmodMul(x1, state->s2[1], MRG32K3A_MOD2);
1292
+ state->s2[2] = (unsigned int)curand_MRGmodMul(x2, state->s2[2], MRG32K3A_MOD2);
1293
+ }
1294
+ skipahead_subsequence( subsequence, state );
1295
+ skipahead( offset, state );
1296
+ state->boxmuller_flag = 0;
1297
+ state->boxmuller_flag_double = 0;
1298
+ state->boxmuller_extra = 0.f;
1299
+ state->boxmuller_extra_double = 0.;
1300
+ }
1301
+
1302
+ /**
1303
+ * \brief Update Sobol32 state to skip \p n elements.
1304
+ *
1305
+ * Update the Sobol32 state in \p state to skip ahead \p n elements.
1306
+ *
1307
+ * All values of \p n are valid.
1308
+ *
1309
+ * \param n - Number of elements to skip
1310
+ * \param state - Pointer to state to update
1311
+ */
1312
+ template <typename T>
1313
+ QUALIFIERS
1314
+ typename CURAND_STD::enable_if<CURAND_STD::is_same<curandStateSobol32_t*, T>::value || CURAND_STD::is_same<curandStateScrambledSobol32_t*, T>::value>::type
1315
+ skipahead(unsigned int n, T state)
1316
+ {
1317
+ unsigned int i_gray;
1318
+ state->x = state->c;
1319
+ state->i += n;
1320
+ /* Convert state->i to gray code */
1321
+ i_gray = state->i ^ (state->i >> 1);
1322
+ for(unsigned int k = 0; k < 32; k++) {
1323
+ if(i_gray & (1 << k)) {
1324
+ state->x ^= state->direction_vectors[k];
1325
+ }
1326
+ }
1327
+ return;
1328
+ }
1329
+
1330
+ /**
1331
+ * \brief Update Sobol64 state to skip \p n elements.
1332
+ *
1333
+ * Update the Sobol64 state in \p state to skip ahead \p n elements.
1334
+ *
1335
+ * All values of \p n are valid.
1336
+ *
1337
+ * \param n - Number of elements to skip
1338
+ * \param state - Pointer to state to update
1339
+ */
1340
+ template <typename T>
1341
+ QUALIFIERS
1342
+ typename CURAND_STD::enable_if<CURAND_STD::is_same<curandStateSobol64_t*, T>::value || CURAND_STD::is_same<curandStateScrambledSobol64_t*, T>::value>::type
1343
+ skipahead(unsigned long long n, T state)
1344
+ {
1345
+ unsigned long long i_gray;
1346
+ state->x = state->c;
1347
+ state->i += n;
1348
+ /* Convert state->i to gray code */
1349
+ i_gray = state->i ^ (state->i >> 1);
1350
+ for(unsigned k = 0; k < 64; k++) {
1351
+ if(i_gray & (1ULL << k)) {
1352
+ state->x ^= state->direction_vectors[k];
1353
+ }
1354
+ }
1355
+ return;
1356
+ }
1357
+
1358
+ /**
1359
+ * \brief Initialize Sobol32 state.
1360
+ *
1361
+ * Initialize Sobol32 state in \p state with the given \p direction \p vectors and
1362
+ * \p offset.
1363
+ *
1364
+ * The direction vector is a device pointer to an array of 32 unsigned ints.
1365
+ * All input values of \p offset are legal.
1366
+ *
1367
+ * \param direction_vectors - Pointer to array of 32 unsigned ints representing the
1368
+ * direction vectors for the desired dimension
1369
+ * \param offset - Absolute offset into sequence
1370
+ * \param state - Pointer to state to initialize
1371
+ */
1372
+ QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
1373
+ unsigned int offset,
1374
+ curandStateSobol32_t *state)
1375
+ {
1376
+ state->i = 0;
1377
+ state->c = 0;
1378
+ for(int i = 0; i < 32; i++) {
1379
+ state->direction_vectors[i] = direction_vectors[i];
1380
+ }
1381
+ state->x = 0;
1382
+ skipahead<curandStateSobol32_t *>(offset, state);
1383
+ }
1384
+ /**
1385
+ * \brief Initialize Scrambled Sobol32 state.
1386
+ *
1387
+ * Initialize Sobol32 state in \p state with the given \p direction \p vectors and
1388
+ * \p offset.
1389
+ *
1390
+ * The direction vector is a device pointer to an array of 32 unsigned ints.
1391
+ * All input values of \p offset are legal.
1392
+ *
1393
+ * \param direction_vectors - Pointer to array of 32 unsigned ints representing the
1394
+ direction vectors for the desired dimension
1395
+ * \param scramble_c Scramble constant
1396
+ * \param offset - Absolute offset into sequence
1397
+ * \param state - Pointer to state to initialize
1398
+ */
1399
+ QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
1400
+ unsigned int scramble_c,
1401
+ unsigned int offset,
1402
+ curandStateScrambledSobol32_t *state)
1403
+ {
1404
+ state->i = 0;
1405
+ state->c = scramble_c;
1406
+ for(int i = 0; i < 32; i++) {
1407
+ state->direction_vectors[i] = direction_vectors[i];
1408
+ }
1409
+ state->x = state->c;
1410
+ skipahead<curandStateScrambledSobol32_t *>(offset, state);
1411
+ }
1412
+
1413
+ QUALIFIERS int __curand_find_trailing_zero(unsigned int x)
1414
+ {
1415
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
1416
+ int y = __ffs(~x);
1417
+ if(y)
1418
+ return y - 1;
1419
+ return 31;
1420
+ ,
1421
+ int i = 1;
1422
+ while(x & 1) {
1423
+ i++;
1424
+ x >>= 1;
1425
+ }
1426
+ i = i - 1;
1427
+ return i == 32 ? 31 : i;
1428
+ )
1429
+ }
1430
+
1431
+ QUALIFIERS int __curand_find_trailing_zero(unsigned long long x)
1432
+ {
1433
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
1434
+ int y = __ffsll(~x);
1435
+ if(y)
1436
+ return y - 1;
1437
+ return 63;
1438
+ ,
1439
+ int i = 1;
1440
+ while(x & 1) {
1441
+ i++;
1442
+ x >>= 1;
1443
+ }
1444
+ i = i - 1;
1445
+ return i == 64 ? 63 : i;
1446
+ )
1447
+ }
1448
+
1449
+ /**
1450
+ * \brief Initialize Sobol64 state.
1451
+ *
1452
+ * Initialize Sobol64 state in \p state with the given \p direction \p vectors and
1453
+ * \p offset.
1454
+ *
1455
+ * The direction vector is a device pointer to an array of 64 unsigned long longs.
1456
+ * All input values of \p offset are legal.
1457
+ *
1458
+ * \param direction_vectors - Pointer to array of 64 unsigned long longs representing the
1459
+ direction vectors for the desired dimension
1460
+ * \param offset - Absolute offset into sequence
1461
+ * \param state - Pointer to state to initialize
1462
+ */
1463
+ QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
1464
+ unsigned long long offset,
1465
+ curandStateSobol64_t *state)
1466
+ {
1467
+ state->i = 0;
1468
+ state->c = 0;
1469
+ for(int i = 0; i < 64; i++) {
1470
+ state->direction_vectors[i] = direction_vectors[i];
1471
+ }
1472
+ state->x = 0;
1473
+ skipahead<curandStateSobol64_t *>(offset, state);
1474
+ }
1475
+
1476
+ /**
1477
+ * \brief Initialize Scrambled Sobol64 state.
1478
+ *
1479
+ * Initialize Sobol64 state in \p state with the given \p direction \p vectors and
1480
+ * \p offset.
1481
+ *
1482
+ * The direction vector is a device pointer to an array of 64 unsigned long longs.
1483
+ * All input values of \p offset are legal.
1484
+ *
1485
+ * \param direction_vectors - Pointer to array of 64 unsigned long longs representing the
1486
+ direction vectors for the desired dimension
1487
+ * \param scramble_c Scramble constant
1488
+ * \param offset - Absolute offset into sequence
1489
+ * \param state - Pointer to state to initialize
1490
+ */
1491
+ QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
1492
+ unsigned long long scramble_c,
1493
+ unsigned long long offset,
1494
+ curandStateScrambledSobol64_t *state)
1495
+ {
1496
+ state->i = 0;
1497
+ state->c = scramble_c;
1498
+ for(int i = 0; i < 64; i++) {
1499
+ state->direction_vectors[i] = direction_vectors[i];
1500
+ }
1501
+ state->x = state->c;
1502
+ skipahead<curandStateScrambledSobol64_t *>(offset, state);
1503
+ }
1504
+
1505
+ /**
1506
+ * \brief Return 32-bits of quasirandomness from a Sobol32 generator.
1507
+ *
1508
+ * Return 32-bits of quasirandomness from the Sobol32 generator in \p state,
1509
+ * increment position of generator by one.
1510
+ *
1511
+ * \param state - Pointer to state to update
1512
+ *
1513
+ * \return 32-bits of quasirandomness as an unsigned int, all bits valid to use.
1514
+ */
1515
+
1516
+ QUALIFIERS unsigned int curand(curandStateSobol32_t * state)
1517
+ {
1518
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1519
+ the trailing zero bit of i
1520
+ */
1521
+ unsigned int res = state->x;
1522
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1523
+ state->i ++;
1524
+ return res;
1525
+ }
1526
+
1527
+ /**
1528
+ * \brief Return 32-bits of quasirandomness from a scrambled Sobol32 generator.
1529
+ *
1530
+ * Return 32-bits of quasirandomness from the scrambled Sobol32 generator in \p state,
1531
+ * increment position of generator by one.
1532
+ *
1533
+ * \param state - Pointer to state to update
1534
+ *
1535
+ * \return 32-bits of quasirandomness as an unsigned int, all bits valid to use.
1536
+ */
1537
+
1538
+ QUALIFIERS unsigned int curand(curandStateScrambledSobol32_t * state)
1539
+ {
1540
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1541
+ the trailing zero bit of i
1542
+ */
1543
+ unsigned int res = state->x;
1544
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1545
+ state->i ++;
1546
+ return res;
1547
+ }
1548
+
1549
+ /**
1550
+ * \brief Return 64-bits of quasirandomness from a Sobol64 generator.
1551
+ *
1552
+ * Return 64-bits of quasirandomness from the Sobol64 generator in \p state,
1553
+ * increment position of generator by one.
1554
+ *
1555
+ * \param state - Pointer to state to update
1556
+ *
1557
+ * \return 64-bits of quasirandomness as an unsigned long long, all bits valid to use.
1558
+ */
1559
+
1560
+ QUALIFIERS unsigned long long curand(curandStateSobol64_t * state)
1561
+ {
1562
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1563
+ the trailing zero bit of i
1564
+ */
1565
+ unsigned long long res = state->x;
1566
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1567
+ state->i ++;
1568
+ return res;
1569
+ }
1570
+
1571
+ /**
1572
+ * \brief Return 64-bits of quasirandomness from a scrambled Sobol64 generator.
1573
+ *
1574
+ * Return 64-bits of quasirandomness from the scrambled Sobol32 generator in \p state,
1575
+ * increment position of generator by one.
1576
+ *
1577
+ * \param state - Pointer to state to update
1578
+ *
1579
+ * \return 64-bits of quasirandomness as an unsigned long long, all bits valid to use.
1580
+ */
1581
+
1582
+ QUALIFIERS unsigned long long curand(curandStateScrambledSobol64_t * state)
1583
+ {
1584
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1585
+ the trailing zero bit of i
1586
+ */
1587
+ unsigned long long res = state->x;
1588
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1589
+ state->i ++;
1590
+ return res;
1591
+ }
1592
+
1593
+ #include "curand_uniform.h"
1594
+ #include "curand_normal.h"
1595
+ #include "curand_lognormal.h"
1596
+ #include "curand_poisson.h"
1597
+ #include "curand_discrete2.h"
1598
+
1599
+ __device__ static inline unsigned int *__get_precalculated_matrix(int n)
1600
+ {
1601
+ if(n == 0) {
1602
+ return precalc_xorwow_matrix[n];
1603
+ }
1604
+ if(n == 2) {
1605
+ return precalc_xorwow_offset_matrix[n];
1606
+ }
1607
+ return precalc_xorwow_matrix[n];
1608
+ }
1609
+
1610
+ #ifndef __CUDACC_RTC__
1611
+ __host__ static inline unsigned int *__get_precalculated_matrix_host(int n)
1612
+ {
1613
+ if(n == 1) {
1614
+ return precalc_xorwow_matrix_host[n];
1615
+ }
1616
+ if(n == 3) {
1617
+ return precalc_xorwow_offset_matrix_host[n];
1618
+ }
1619
+ return precalc_xorwow_matrix_host[n];
1620
+ }
1621
+ #endif // #ifndef __CUDACC_RTC__
1622
+
1623
+ __device__ static inline unsigned int *__get_mrg32k3a_matrix(int n)
1624
+ {
1625
+ if(n == 0) {
1626
+ return mrg32k3aM1[n][0];
1627
+ }
1628
+ if(n == 2) {
1629
+ return mrg32k3aM2[n][0];
1630
+ }
1631
+ if(n == 4) {
1632
+ return mrg32k3aM1SubSeq[n][0];
1633
+ }
1634
+ if(n == 6) {
1635
+ return mrg32k3aM2SubSeq[n][0];
1636
+ }
1637
+ if(n == 8) {
1638
+ return mrg32k3aM1Seq[n][0];
1639
+ }
1640
+ if(n == 10) {
1641
+ return mrg32k3aM2Seq[n][0];
1642
+ }
1643
+ return mrg32k3aM1[n][0];
1644
+ }
1645
+
1646
+ #ifndef __CUDACC_RTC__
1647
+ __host__ static inline unsigned int *__get_mrg32k3a_matrix_host(int n)
1648
+ {
1649
+ if(n == 1) {
1650
+ return mrg32k3aM1Host[n][0];
1651
+ }
1652
+ if(n == 3) {
1653
+ return mrg32k3aM2Host[n][0];
1654
+ }
1655
+ if(n == 5) {
1656
+ return mrg32k3aM1SubSeqHost[n][0];
1657
+ }
1658
+ if(n == 7) {
1659
+ return mrg32k3aM2SubSeqHost[n][0];
1660
+ }
1661
+ if(n == 9) {
1662
+ return mrg32k3aM1SeqHost[n][0];
1663
+ }
1664
+ if(n == 11) {
1665
+ return mrg32k3aM2SeqHost[n][0];
1666
+ }
1667
+ return mrg32k3aM1Host[n][0];
1668
+ }
1669
+
1670
+ __host__ static inline double *__get__cr_lgamma_table_host(void) {
1671
+ return __cr_lgamma_table;
1672
+ }
1673
+ #endif // #ifndef __CUDACC_RTC__
1674
+
1675
+ /** @} */
1676
+
1677
+ #endif // !defined(CURAND_KERNEL_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_lognormal.h ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_LOGNORMAL_H_)
52
+ #define CURAND_LOGNORMAL_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+
68
+ /**
69
+ * \brief Return a log-normally distributed float from an XORWOW generator.
70
+ *
71
+ * Return a single log-normally distributed float derived from a normal
72
+ * distribution with mean \p mean and standard deviation \p stddev
73
+ * from the XORWOW generator in \p state,
74
+ * increment position of generator by one.
75
+ *
76
+ * The implementation uses a Box-Muller transform to generate two
77
+ * normally distributed results, transforms them to log-normal distribution,
78
+ * then returns them one at a time.
79
+ * See ::curand_log_normal2() for a more efficient version that returns
80
+ * both results at once.
81
+ *
82
+ * \param state - Pointer to state to update
83
+ * \param mean - Mean of the related normal distribution
84
+ * \param stddev - Standard deviation of the related normal distribution
85
+ *
86
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
87
+ */
88
+ QUALIFIERS float curand_log_normal(curandStateXORWOW_t *state, float mean, float stddev)
89
+ {
90
+ if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
91
+ unsigned int x, y;
92
+ x = curand(state);
93
+ y = curand(state);
94
+ float2 v = _curand_box_muller(x, y);
95
+ state->boxmuller_extra = expf(mean + (stddev * v.y));
96
+ state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
97
+ return expf(mean + (stddev * v.x));
98
+ }
99
+ state->boxmuller_flag = 0;
100
+ return state->boxmuller_extra;
101
+ }
102
+
103
+ /**
104
+ * \brief Return a log-normally distributed float from an Philox4_32_10 generator.
105
+ *
106
+ * Return a single log-normally distributed float derived from a normal
107
+ * distribution with mean \p mean and standard deviation \p stddev
108
+ * from the Philox4_32_10 generator in \p state,
109
+ * increment position of generator by one.
110
+ *
111
+ * The implementation uses a Box-Muller transform to generate two
112
+ * normally distributed results, transforms them to log-normal distribution,
113
+ * then returns them one at a time.
114
+ * See ::curand_log_normal2() for a more efficient version that returns
115
+ * both results at once.
116
+ *
117
+ * \param state - Pointer to state to update
118
+ * \param mean - Mean of the related normal distribution
119
+ * \param stddev - Standard deviation of the related normal distribution
120
+ *
121
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
122
+ */
123
+
124
+ QUALIFIERS float curand_log_normal(curandStatePhilox4_32_10_t *state, float mean, float stddev)
125
+ {
126
+ if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
127
+ unsigned int x, y;
128
+ x = curand(state);
129
+ y = curand(state);
130
+ float2 v = _curand_box_muller(x, y);
131
+ state->boxmuller_extra = expf(mean + (stddev * v.y));
132
+ state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
133
+ return expf(mean + (stddev * v.x));
134
+ }
135
+ state->boxmuller_flag = 0;
136
+ return state->boxmuller_extra;
137
+ }
138
+
139
+ /**
140
+ * \brief Return two normally distributed floats from an XORWOW generator.
141
+ *
142
+ * Return two log-normally distributed floats derived from a normal
143
+ * distribution with mean \p mean and standard deviation \p stddev
144
+ * from the XORWOW generator in \p state,
145
+ * increment position of generator by two.
146
+ *
147
+ * The implementation uses a Box-Muller transform to generate two
148
+ * normally distributed results, then transforms them to log-normal.
149
+ *
150
+ * \param state - Pointer to state to update
151
+ * \param mean - Mean of the related normal distribution
152
+ * \param stddev - Standard deviation of the related normal distribution
153
+ *
154
+ * \return Log-normally distributed float2 where each element is from a
155
+ * distribution with mean \p mean and standard deviation \p stddev
156
+ */
157
+ QUALIFIERS float2 curand_log_normal2(curandStateXORWOW_t *state, float mean, float stddev)
158
+ {
159
+ float2 v = curand_box_muller(state);
160
+ v.x = expf(mean + (stddev * v.x));
161
+ v.y = expf(mean + (stddev * v.y));
162
+ return v;
163
+ }
164
+
165
+ /**
166
+ * \brief Return two normally distributed floats from an Philox4_32_10 generator.
167
+ *
168
+ * Return two log-normally distributed floats derived from a normal
169
+ * distribution with mean \p mean and standard deviation \p stddev
170
+ * from the Philox4_32_10 generator in \p state,
171
+ * increment position of generator by two.
172
+ *
173
+ * The implementation uses a Box-Muller transform to generate two
174
+ * normally distributed results, then transforms them to log-normal.
175
+ *
176
+ * \param state - Pointer to state to update
177
+ * \param mean - Mean of the related normal distribution
178
+ * \param stddev - Standard deviation of the related normal distribution
179
+ *
180
+ * \return Log-normally distributed float2 where each element is from a
181
+ * distribution with mean \p mean and standard deviation \p stddev
182
+ */
183
+ QUALIFIERS float2 curand_log_normal2(curandStatePhilox4_32_10_t *state, float mean, float stddev)
184
+ {
185
+ float2 v = curand_box_muller(state);
186
+ v.x = expf(mean + (stddev * v.x));
187
+ v.y = expf(mean + (stddev * v.y));
188
+ return v;
189
+ }
190
+ /**
191
+ * \brief Return four normally distributed floats from an Philox4_32_10 generator.
192
+ *
193
+ * Return four log-normally distributed floats derived from a normal
194
+ * distribution with mean \p mean and standard deviation \p stddev
195
+ * from the Philox4_32_10 generator in \p state,
196
+ * increment position of generator by four.
197
+ *
198
+ * The implementation uses a Box-Muller transform to generate two
199
+ * normally distributed results, then transforms them to log-normal.
200
+ *
201
+ * \param state - Pointer to state to update
202
+ * \param mean - Mean of the related normal distribution
203
+ * \param stddev - Standard deviation of the related normal distribution
204
+ *
205
+ * \return Log-normally distributed float4 where each element is from a
206
+ * distribution with mean \p mean and standard deviation \p stddev
207
+ */
208
+ QUALIFIERS float4 curand_log_normal4(curandStatePhilox4_32_10_t *state, float mean, float stddev)
209
+ {
210
+ float4 v = curand_box_muller4(state);
211
+ v.x = expf(mean + (stddev * v.x));
212
+ v.y = expf(mean + (stddev * v.y));
213
+ v.z = expf(mean + (stddev * v.z));
214
+ v.w = expf(mean + (stddev * v.w));
215
+ return v;
216
+ }
217
+
218
+ /**
219
+ * \brief Return a log-normally distributed float from an MRG32k3a generator.
220
+ *
221
+ * Return a single log-normally distributed float derived from a normal
222
+ * distribution with mean \p mean and standard deviation \p stddev
223
+ * from the MRG32k3a generator in \p state,
224
+ * increment position of generator by one.
225
+ *
226
+ * The implementation uses a Box-Muller transform to generate two
227
+ * normally distributed results, transforms them to log-normal distribution,
228
+ * then returns them one at a time.
229
+ * See ::curand_log_normal2() for a more efficient version that returns
230
+ * both results at once.
231
+ *
232
+ * \param state - Pointer to state to update
233
+ * \param mean - Mean of the related normal distribution
234
+ * \param stddev - Standard deviation of the related normal distribution
235
+ *
236
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
237
+ */
238
+ QUALIFIERS float curand_log_normal(curandStateMRG32k3a_t *state, float mean, float stddev)
239
+ {
240
+ if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
241
+ float2 v = curand_box_muller_mrg(state);
242
+ state->boxmuller_extra = expf(mean + (stddev * v.y));
243
+ state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
244
+ return expf(mean + (stddev * v.x));
245
+ }
246
+ state->boxmuller_flag = 0;
247
+ return state->boxmuller_extra;
248
+ }
249
+
250
+ /**
251
+ * \brief Return two normally distributed floats from an MRG32k3a generator.
252
+ *
253
+ * Return two log-normally distributed floats derived from a normal
254
+ * distribution with mean \p mean and standard deviation \p stddev
255
+ * from the MRG32k3a generator in \p state,
256
+ * increment position of generator by two.
257
+ *
258
+ * The implementation uses a Box-Muller transform to generate two
259
+ * normally distributed results, then transforms them to log-normal.
260
+ *
261
+ * \param state - Pointer to state to update
262
+ * \param mean - Mean of the related normal distribution
263
+ * \param stddev - Standard deviation of the related normal distribution
264
+ *
265
+ * \return Log-normally distributed float2 where each element is from a
266
+ * distribution with mean \p mean and standard deviation \p stddev
267
+ */
268
+ QUALIFIERS float2 curand_log_normal2(curandStateMRG32k3a_t *state, float mean, float stddev)
269
+ {
270
+ float2 v = curand_box_muller_mrg(state);
271
+ v.x = expf(mean + (stddev * v.x));
272
+ v.y = expf(mean + (stddev * v.y));
273
+ return v;
274
+ }
275
+
276
+ /**
277
+ * \brief Return a log-normally distributed float from an MTGP32 generator.
278
+ *
279
+ * Return a single log-normally distributed float derived from a normal
280
+ * distribution with mean \p mean and standard deviation \p stddev
281
+ * from the MTGP32 generator in \p state,
282
+ * increment position of generator.
283
+ *
284
+ * The implementation uses the inverse cumulative distribution function
285
+ * to generate a normally distributed result, then transforms the result
286
+ * to log-normal.
287
+ *
288
+ * \param state - Pointer to state to update
289
+ * \param mean - Mean of the related normal distribution
290
+ * \param stddev - Standard deviation of the related normal distribution
291
+ *
292
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
293
+ */
294
+ QUALIFIERS float curand_log_normal(curandStateMtgp32_t *state, float mean, float stddev)
295
+ {
296
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
297
+ }
298
+
299
+ /**
300
+ * \brief Return a log-normally distributed float from a Sobol32 generator.
301
+ *
302
+ * Return a single log-normally distributed float derived from a normal
303
+ * distribution with mean \p mean and standard deviation \p stddev
304
+ * from the Sobol32 generator in \p state,
305
+ * increment position of generator by one.
306
+ *
307
+ * The implementation uses the inverse cumulative distribution function
308
+ * to generate a normally distributed result, then transforms the result
309
+ * to log-normal.
310
+ *
311
+ * \param state - Pointer to state to update
312
+ * \param mean - Mean of the related normal distribution
313
+ * \param stddev - Standard deviation of the related normal distribution
314
+ *
315
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
316
+ */
317
+ QUALIFIERS float curand_log_normal(curandStateSobol32_t *state, float mean, float stddev)
318
+ {
319
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
320
+ }
321
+ /**
322
+ * \brief Return a log-normally distributed float from a scrambled Sobol32 generator.
323
+ *
324
+ * Return a single log-normally distributed float derived from a normal
325
+ * distribution with mean \p mean and standard deviation \p stddev
326
+ * from the scrambled Sobol32 generator in \p state,
327
+ * increment position of generator by one.
328
+ *
329
+ * The implementation uses the inverse cumulative distribution function
330
+ * to generate a normally distributed result, then transforms the result
331
+ * to log-normal.
332
+ *
333
+ * \param state - Pointer to state to update
334
+ * \param mean - Mean of the related normal distribution
335
+ * \param stddev - Standard deviation of the related normal distribution
336
+ *
337
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
338
+ */
339
+ QUALIFIERS float curand_log_normal(curandStateScrambledSobol32_t *state, float mean, float stddev)
340
+ {
341
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
342
+ }
343
+
344
+ /**
345
+ * \brief Return a log-normally distributed float from a Sobol64 generator.
346
+ *
347
+ * Return a single log-normally distributed float derived from a normal
348
+ * distribution with mean \p mean and standard deviation \p stddev
349
+ * from the Sobol64 generator in \p state,
350
+ * increment position of generator by one.
351
+ *
352
+ * The implementation uses the inverse cumulative distribution function
353
+ * to generate normally distributed results, then converts to log-normal
354
+ * distribution.
355
+ *
356
+ * \param state - Pointer to state to update
357
+ * \param mean - Mean of the related normal distribution
358
+ * \param stddev - Standard deviation of the related normal distribution
359
+ *
360
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
361
+ */
362
+ QUALIFIERS float curand_log_normal(curandStateSobol64_t *state, float mean, float stddev)
363
+ {
364
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
365
+ }
366
+
367
+ /**
368
+ * \brief Return a log-normally distributed float from a scrambled Sobol64 generator.
369
+ *
370
+ * Return a single log-normally distributed float derived from a normal
371
+ * distribution with mean \p mean and standard deviation \p stddev
372
+ * from the scrambled Sobol64 generator in \p state,
373
+ * increment position of generator by one.
374
+ *
375
+ * The implementation uses the inverse cumulative distribution function
376
+ * to generate normally distributed results, then converts to log-normal
377
+ * distribution.
378
+ *
379
+ * \param state - Pointer to state to update
380
+ * \param mean - Mean of the related normal distribution
381
+ * \param stddev - Standard deviation of the related normal distribution
382
+ *
383
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
384
+ */
385
+ QUALIFIERS float curand_log_normal(curandStateScrambledSobol64_t *state, float mean, float stddev)
386
+ {
387
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
388
+ }
389
+
390
+ /**
391
+ * \brief Return a log-normally distributed double from an XORWOW generator.
392
+ *
393
+ * Return a single normally distributed double derived from a normal
394
+ * distribution with mean \p mean and standard deviation \p stddev
395
+ * from the XORWOW generator in \p state,
396
+ * increment position of generator.
397
+ *
398
+ * The implementation uses a Box-Muller transform to generate two
399
+ * normally distributed results, transforms them to log-normal distribution,
400
+ * then returns them one at a time.
401
+ * See ::curand_log_normal2_double() for a more efficient version that returns
402
+ * both results at once.
403
+ *
404
+ * \param state - Pointer to state to update
405
+ * \param mean - Mean of the related normal distribution
406
+ * \param stddev - Standard deviation of the related normal distribution
407
+ *
408
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
409
+ */
410
+
411
+ QUALIFIERS double curand_log_normal_double(curandStateXORWOW_t *state, double mean, double stddev)
412
+ {
413
+ if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
414
+ unsigned int x0, x1, y0, y1;
415
+ x0 = curand(state);
416
+ x1 = curand(state);
417
+ y0 = curand(state);
418
+ y1 = curand(state);
419
+ double2 v = _curand_box_muller_double(x0, x1, y0, y1);
420
+ state->boxmuller_extra_double = exp(mean + (stddev * v.y));
421
+ state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
422
+ return exp(mean + (stddev * v.x));
423
+ }
424
+ state->boxmuller_flag_double = 0;
425
+ return state->boxmuller_extra_double;
426
+ }
427
+
428
+ /**
429
+ * \brief Return a log-normally distributed double from an Philox4_32_10 generator.
430
+ *
431
+ * Return a single normally distributed double derived from a normal
432
+ * distribution with mean \p mean and standard deviation \p stddev
433
+ * from the Philox4_32_10 generator in \p state,
434
+ * increment position of generator.
435
+ *
436
+ * The implementation uses a Box-Muller transform to generate two
437
+ * normally distributed results, transforms them to log-normal distribution,
438
+ * then returns them one at a time.
439
+ * See ::curand_log_normal2_double() for a more efficient version that returns
440
+ * both results at once.
441
+ *
442
+ * \param state - Pointer to state to update
443
+ * \param mean - Mean of the related normal distribution
444
+ * \param stddev - Standard deviation of the related normal distribution
445
+ *
446
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
447
+ */
448
+
449
+ QUALIFIERS double curand_log_normal_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
450
+ {
451
+ if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
452
+ uint4 _x;
453
+ _x = curand4(state);
454
+ double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
455
+ state->boxmuller_extra_double = exp(mean + (stddev * v.y));
456
+ state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
457
+ return exp(mean + (stddev * v.x));
458
+ }
459
+ state->boxmuller_flag_double = 0;
460
+ return state->boxmuller_extra_double;
461
+ }
462
+
463
+
464
+ /**
465
+ * \brief Return two log-normally distributed doubles from an XORWOW generator.
466
+ *
467
+ * Return two log-normally distributed doubles derived from a normal
468
+ * distribution with mean \p mean and standard deviation \p stddev
469
+ * from the XORWOW generator in \p state,
470
+ * increment position of generator by two.
471
+ *
472
+ * The implementation uses a Box-Muller transform to generate two
473
+ * normally distributed results, and transforms them to log-normal distribution,.
474
+ *
475
+ * \param state - Pointer to state to update
476
+ * \param mean - Mean of the related normal distribution
477
+ * \param stddev - Standard deviation of the related normal distribution
478
+ *
479
+ * \return Log-normally distributed double2 where each element is from a
480
+ * distribution with mean \p mean and standard deviation \p stddev
481
+ */
482
+ QUALIFIERS double2 curand_log_normal2_double(curandStateXORWOW_t *state, double mean, double stddev)
483
+ {
484
+ double2 v = curand_box_muller_double(state);
485
+ v.x = exp(mean + (stddev * v.x));
486
+ v.y = exp(mean + (stddev * v.y));
487
+ return v;
488
+ }
489
+
490
+ /**
491
+ * \brief Return two log-normally distributed doubles from an Philox4_32_10 generator.
492
+ *
493
+ * Return two log-normally distributed doubles derived from a normal
494
+ * distribution with mean \p mean and standard deviation \p stddev
495
+ * from the Philox4_32_10 generator in \p state,
496
+ * increment position of generator by four.
497
+ *
498
+ * The implementation uses a Box-Muller transform to generate two
499
+ * normally distributed results, and transforms them to log-normal distribution,.
500
+ *
501
+ * \param state - Pointer to state to update
502
+ * \param mean - Mean of the related normal distribution
503
+ * \param stddev - Standard deviation of the related normal distribution
504
+ *
505
+ * \return Log-normally distributed double4 where each element is from a
506
+ * distribution with mean \p mean and standard deviation \p stddev
507
+ */
508
+ QUALIFIERS double2 curand_log_normal2_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
509
+ {
510
+ double2 v = curand_box_muller2_double(state);
511
+ v.x = exp(mean + (stddev * v.x));
512
+ v.y = exp(mean + (stddev * v.y));
513
+ return v;
514
+ }
515
+ // nor part of API
516
+ QUALIFIERS double4 curand_log_normal4_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
517
+ {
518
+ double4 v = curand_box_muller4_double(state);
519
+ v.x = exp(mean + (stddev * v.x));
520
+ v.y = exp(mean + (stddev * v.y));
521
+ v.z = exp(mean + (stddev * v.z));
522
+ v.w = exp(mean + (stddev * v.w));
523
+ return v;
524
+ }
525
+
526
+ /**
527
+ * \brief Return a log-normally distributed double from an MRG32k3a generator.
528
+ *
529
+ * Return a single normally distributed double derived from a normal
530
+ * distribution with mean \p mean and standard deviation \p stddev
531
+ * from the MRG32k3a generator in \p state,
532
+ * increment position of generator.
533
+ *
534
+ * The implementation uses a Box-Muller transform to generate two
535
+ * normally distributed results, transforms them to log-normal distribution,
536
+ * then returns them one at a time.
537
+ * See ::curand_log_normal2_double() for a more efficient version that returns
538
+ * both results at once.
539
+ *
540
+ * \param state - Pointer to state to update
541
+ * \param mean - Mean of the related normal distribution
542
+ * \param stddev - Standard deviation of the related normal distribution
543
+ *
544
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
545
+ */
546
+ QUALIFIERS double curand_log_normal_double(curandStateMRG32k3a_t *state, double mean, double stddev)
547
+ {
548
+ if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
549
+ double2 v = curand_box_muller_mrg_double(state);
550
+ state->boxmuller_extra_double = exp(mean + (stddev * v.y));
551
+ state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
552
+ return exp(mean + (stddev * v.x));
553
+ }
554
+ state->boxmuller_flag_double = 0;
555
+ return state->boxmuller_extra_double;
556
+ }
557
+
558
+ /**
559
+ * \brief Return two log-normally distributed doubles from an MRG32k3a generator.
560
+ *
561
+ * Return two log-normally distributed doubles derived from a normal
562
+ * distribution with mean \p mean and standard deviation \p stddev
563
+ * from the MRG32k3a generator in \p state,
564
+ * increment position of generator by two.
565
+ *
566
+ * The implementation uses a Box-Muller transform to generate two
567
+ * normally distributed results, and transforms them to log-normal distribution,.
568
+ *
569
+ * \param state - Pointer to state to update
570
+ * \param mean - Mean of the related normal distribution
571
+ * \param stddev - Standard deviation of the related normal distribution
572
+ *
573
+ * \return Log-normally distributed double2 where each element is from a
574
+ * distribution with mean \p mean and standard deviation \p stddev
575
+ */
576
+ QUALIFIERS double2 curand_log_normal2_double(curandStateMRG32k3a_t *state, double mean, double stddev)
577
+ {
578
+ double2 v = curand_box_muller_mrg_double(state);
579
+ v.x = exp(mean + (stddev * v.x));
580
+ v.y = exp(mean + (stddev * v.y));
581
+ return v;
582
+ }
583
+
584
+ /**
585
+ * \brief Return a log-normally distributed double from an MTGP32 generator.
586
+ *
587
+ * Return a single log-normally distributed double derived from a normal
588
+ * distribution with mean \p mean and standard deviation \p stddev
589
+ * from the MTGP32 generator in \p state,
590
+ * increment position of generator.
591
+ *
592
+ * The implementation uses the inverse cumulative distribution function
593
+ * to generate normally distributed results, and transforms them into
594
+ * log-normal distribution.
595
+ *
596
+ * \param state - Pointer to state to update
597
+ * \param mean - Mean of the related normal distribution
598
+ * \param stddev - Standard deviation of the related normal distribution
599
+ *
600
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
601
+ */
602
+ QUALIFIERS double curand_log_normal_double(curandStateMtgp32_t *state, double mean, double stddev)
603
+ {
604
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
605
+ }
606
+
607
+ /**
608
+ * \brief Return a log-normally distributed double from a Sobol32 generator.
609
+ *
610
+ * Return a single log-normally distributed double derived from a normal
611
+ * distribution with mean \p mean and standard deviation \p stddev
612
+ * from the Sobol32 generator in \p state,
613
+ * increment position of generator by one.
614
+ *
615
+ * The implementation uses the inverse cumulative distribution function
616
+ * to generate normally distributed results, and transforms them into
617
+ * log-normal distribution.
618
+ *
619
+ * \param state - Pointer to state to update
620
+ * \param mean - Mean of the related normal distribution
621
+ * \param stddev - Standard deviation of the related normal distribution
622
+ *
623
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
624
+ */
625
+ QUALIFIERS double curand_log_normal_double(curandStateSobol32_t *state, double mean, double stddev)
626
+ {
627
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
628
+ }
629
+
630
+ /**
631
+ * \brief Return a log-normally distributed double from a scrambled Sobol32 generator.
632
+ *
633
+ * Return a single log-normally distributed double derived from a normal
634
+ * distribution with mean \p mean and standard deviation \p stddev
635
+ * from the scrambled Sobol32 generator in \p state,
636
+ * increment position of generator by one.
637
+ *
638
+ * The implementation uses the inverse cumulative distribution function
639
+ * to generate normally distributed results, and transforms them into
640
+ * log-normal distribution.
641
+ *
642
+ * \param state - Pointer to state to update
643
+ * \param mean - Mean of the related normal distribution
644
+ * \param stddev - Standard deviation of the related normal distribution
645
+ *
646
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
647
+ */
648
+ QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol32_t *state, double mean, double stddev)
649
+ {
650
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
651
+ }
652
+
653
+ /**
654
+ * \brief Return a log-normally distributed double from a Sobol64 generator.
655
+ *
656
+ * Return a single normally distributed double derived from a normal
657
+ * distribution with mean \p mean and standard deviation \p stddev
658
+ * from the Sobol64 generator in \p state,
659
+ * increment position of generator by one.
660
+ *
661
+ * The implementation uses the inverse cumulative distribution function
662
+ * to generate normally distributed results.
663
+ *
664
+ * \param state - Pointer to state to update
665
+ * \param mean - Mean of the related normal distribution
666
+ * \param stddev - Standard deviation of the related normal distribution
667
+ *
668
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
669
+ */
670
+ QUALIFIERS double curand_log_normal_double(curandStateSobol64_t *state, double mean, double stddev)
671
+ {
672
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
673
+ }
674
+
675
+ /**
676
+ * \brief Return a log-normally distributed double from a scrambled Sobol64 generator.
677
+ *
678
+ * Return a single normally distributed double derived from a normal
679
+ * distribution with mean \p mean and standard deviation \p stddev
680
+ * from the scrambled Sobol64 generator in \p state,
681
+ * increment position of generator by one.
682
+ *
683
+ * The implementation uses the inverse cumulative distribution function
684
+ * to generate normally distributed results.
685
+ *
686
+ * \param state - Pointer to state to update
687
+ * \param mean - Mean of the related normal distribution
688
+ * \param stddev - Standard deviation of the related normal distribution
689
+ *
690
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
691
+ */
692
+ QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol64_t *state, double mean, double stddev)
693
+ {
694
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
695
+ }
696
+
697
+ #endif // !defined(CURAND_LOGNORMAL_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mrg32k3a.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CURAND_MTGP32_H
51
+ #define CURAND_MTGP32_H
52
+ /*
53
+ * @file curand_mtgp32.h
54
+ *
55
+ * @brief Mersenne Twister for Graphic Processors (mtgp32), which
56
+ * generates 32-bit unsigned integers and single precision floating
57
+ * point numbers based on IEEE 754 format.
58
+ *
59
+ * @author Mutsuo Saito (Hiroshima University)
60
+ * @author Makoto Matsumoto (Hiroshima University)
61
+ *
62
+ */
63
+ /*
64
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
65
+ * University. All rights reserved.
66
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
67
+ * University and University of Tokyo. All rights reserved.
68
+ *
69
+ * Redistribution and use in source and binary forms, with or without
70
+ * modification, are permitted provided that the following conditions are
71
+ * met:
72
+ *
73
+ * * Redistributions of source code must retain the above copyright
74
+ * notice, this list of conditions and the following disclaimer.
75
+ * * Redistributions in binary form must reproduce the above
76
+ * copyright notice, this list of conditions and the following
77
+ * disclaimer in the documentation and/or other materials provided
78
+ * with the distribution.
79
+ * * Neither the name of the Hiroshima University nor the names of
80
+ * its contributors may be used to endorse or promote products
81
+ * derived from this software without specific prior written
82
+ * permission.
83
+ *
84
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
85
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
86
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
87
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
88
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
89
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
90
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
91
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
92
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
93
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
94
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
95
+ */
96
+
97
+
98
+ #define MTGPDC_MEXP 11213
99
+ #define MTGPDC_N 351
100
+ #define MTGPDC_FLOOR_2P 256
101
+ #define MTGPDC_CEIL_2P 512
102
+ #define MTGPDC_PARAM_TABLE mtgp32dc_params_fast_11213
103
+ #define MTGP32_STATE_SIZE 1024
104
+ #define MTGP32_STATE_MASK 1023
105
+ #define CURAND_NUM_MTGP32_PARAMS 200
106
+ #define MEXP 11213
107
+ #define THREAD_NUM MTGPDC_FLOOR_2P
108
+ #define LARGE_SIZE (THREAD_NUM * 3)
109
+ #define TBL_SIZE 16
110
+
111
+ /**
112
+ * \addtogroup DEVICE Device API
113
+ *
114
+ * @{
115
+ */
116
+
117
+ /*
118
+ * \struct MTGP32_PARAMS_FAST_T
119
+ * MTGP32 parameters.
120
+ * Some element is redundant to keep structure simple.
121
+ *
122
+ * \b pos is a pick up position which is selected to have good
123
+ * performance on graphic processors. 3 < \b pos < Q, where Q is a
124
+ * maximum number such that the size of status array - Q is a power of
125
+ * 2. For example, when \b mexp is 44497, size of 32-bit status array
126
+ * is 696, and Q is 184, then \b pos is between 4 and 183. This means
127
+ * 512 parallel calculations is allowed when \b mexp is 44497.
128
+ *
129
+ * \b poly_sha1 is SHA1 digest of the characteristic polynomial of
130
+ * state transition function. SHA1 is calculated based on printing
131
+ * form of the polynomial. This is important when we use parameters
132
+ * generated by the dynamic creator which
133
+ *
134
+ * \b mask This is a mask to make the dimension of state space have
135
+ * just Mersenne Prime. This is redundant.
136
+ */
137
+
138
+ struct mtgp32_params_fast;
139
+
140
+ struct mtgp32_params_fast {
141
+ int mexp; /*< Mersenne exponent. This is redundant. */
142
+ int pos; /*< pick up position. */
143
+ int sh1; /*< shift value 1. 0 < sh1 < 32. */
144
+ int sh2; /*< shift value 2. 0 < sh2 < 32. */
145
+ unsigned int tbl[16]; /*< a small matrix. */
146
+ unsigned int tmp_tbl[16]; /*< a small matrix for tempering. */
147
+ unsigned int flt_tmp_tbl[16]; /*< a small matrix for tempering and
148
+ converting to float. */
149
+ unsigned int mask; /*< This is a mask for state space */
150
+ unsigned char poly_sha1[21]; /*< SHA1 digest */
151
+ };
152
+
153
+ /** \cond UNHIDE_TYPEDEFS */
154
+ typedef struct mtgp32_params_fast mtgp32_params_fast_t;
155
+ /** \endcond */
156
+
157
+ /*
158
+ * Generator Parameters.
159
+ */
160
+ struct mtgp32_kernel_params;
161
+ struct mtgp32_kernel_params {
162
+ unsigned int pos_tbl[CURAND_NUM_MTGP32_PARAMS];
163
+ unsigned int param_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
164
+ unsigned int temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
165
+ unsigned int single_temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
166
+ unsigned int sh1_tbl[CURAND_NUM_MTGP32_PARAMS];
167
+ unsigned int sh2_tbl[CURAND_NUM_MTGP32_PARAMS];
168
+ unsigned int mask[1];
169
+ };
170
+
171
+ /** \cond UNHIDE_TYPEDEFS */
172
+ typedef struct mtgp32_kernel_params mtgp32_kernel_params_t;
173
+ /** \endcond */
174
+
175
+
176
+
177
+ /*
178
+ * kernel I/O
179
+ * This structure must be initialized before first use.
180
+ */
181
+
182
+ /* MTGP (Mersenne Twister) RNG */
183
+ /* This generator uses the Mersenne Twister algorithm of
184
+ * http://arxiv.org/abs/1005.4973v2
185
+ * Has period 2^11213.
186
+ */
187
+
188
+ /**
189
+ * CURAND MTGP32 state
190
+ */
191
+ struct curandStateMtgp32;
192
+
193
+ struct curandStateMtgp32 {
194
+ unsigned int s[MTGP32_STATE_SIZE];
195
+ int offset;
196
+ int pIdx;
197
+ mtgp32_kernel_params_t * k;
198
+ };
199
+
200
+ /*
201
+ * CURAND MTGP32 state
202
+ */
203
+ /** \cond UNHIDE_TYPEDEFS */
204
+ typedef struct curandStateMtgp32 curandStateMtgp32_t;
205
+ /** \endcond */
206
+
207
+ /** @} */
208
+
209
+ #endif
210
+
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_host.h ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * curand_mtgp32_host.h
52
+ *
53
+ *
54
+ * MTGP32-11213
55
+ *
56
+ * Mersenne Twister RNG for the GPU
57
+ *
58
+ * The period of generated integers is 2<sup>11213</sup>-1.
59
+ *
60
+ * This code generates 32-bit unsigned integers, and
61
+ * single precision floating point numbers uniformly distributed
62
+ * in the range [1, 2). (float r; 1.0 <= r < 2.0)
63
+ */
64
+
65
+ /*
66
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
67
+ * University. All rights reserved.
68
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
69
+ * University and University of Tokyo. All rights reserved.
70
+ *
71
+ * Redistribution and use in source and binary forms, with or without
72
+ * modification, are permitted provided that the following conditions are
73
+ * met:
74
+ *
75
+ * * Redistributions of source code must retain the above copyright
76
+ * notice, this list of conditions and the following disclaimer.
77
+ * * Redistributions in binary form must reproduce the above
78
+ * copyright notice, this list of conditions and the following
79
+ * disclaimer in the documentation and/or other materials provided
80
+ * with the distribution.
81
+ * * Neither the name of the Hiroshima University nor the names of
82
+ * its contributors may be used to endorse or promote products
83
+ * derived from this software without specific prior written
84
+ * permission.
85
+ *
86
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
87
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
88
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
89
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
90
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
91
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
92
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
93
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
94
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
95
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
96
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97
+ */
98
+ #if !defined CURAND_MTGP32_HOST_H
99
+ #define CURAND_MTGP32_HOST_H
100
+
101
+ #if !defined(QUALIFIERS)
102
+ #define QUALIFIERS static inline __device__
103
+ #endif
104
+
105
+ #include <cuda_runtime.h>
106
+ #include <stdlib.h>
107
+ #include <memory.h>
108
+ #include <string.h>
109
+ #include "curand.h"
110
+ #include "curand_mtgp32.h"
111
+ #include "curand_mtgp32dc_p_11213.h"
112
+
113
+
114
+ /**
115
+ * \addtogroup DEVICE Device API
116
+ *
117
+ * @{
118
+ */
119
+
120
+ static const unsigned int non_zero = 0x4d544750;
121
+
122
+ /*
123
+ * This function represents a function used in the initialization
124
+ * by mtgp32_init_by_array() and mtgp32_init_by_str().
125
+ * @param[in] x 32-bit integer
126
+ * @return 32-bit integer
127
+ */
128
+ static __forceinline__ unsigned int ini_func1(unsigned int x) {
129
+ return (x ^ (x >> 27)) * (1664525);
130
+ }
131
+
132
+ /*
133
+ * This function represents a function used in the initialization
134
+ * by mtgp32_init_by_array() and mtgp32_init_by_str().
135
+ * @param[in] x 32-bit integer
136
+ * @return 32-bit integer
137
+ */
138
+ static __forceinline__ unsigned int ini_func2(unsigned int x) {
139
+ return (x ^ (x >> 27)) * (1566083941);
140
+ }
141
+
142
+ /*
143
+ * This function initializes the internal state array with a 32-bit
144
+ * integer seed. The allocated memory should be freed by calling
145
+ * mtgp32_free(). \b para should be one of the elements in the
146
+ * parameter table (mtgp32-param-ref.c).
147
+ *
148
+ * This function is call by cuda program, because cuda program uses
149
+ * another structure and another allocation method.
150
+ *
151
+ * @param[out] array MTGP internal status vector.
152
+ * @param[in] para parameter structure
153
+ * @param[in] seed a 32-bit integer used as the seed.
154
+ */
155
+ static __forceinline__ __host__
156
+ void mtgp32_init_state(unsigned int state[],
157
+ const mtgp32_params_fast_t *para, unsigned int seed) {
158
+ int i;
159
+ int size = para->mexp / 32 + 1;
160
+ unsigned int hidden_seed;
161
+ unsigned int tmp;
162
+ hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
163
+ tmp = hidden_seed;
164
+ tmp += tmp >> 16;
165
+ tmp += tmp >> 8;
166
+ memset(state, tmp & 0xff, sizeof(unsigned int) * size);
167
+ state[0] = seed;
168
+ state[1] = hidden_seed;
169
+ for (i = 1; i < size; i++) {
170
+ state[i] ^= (1812433253) * (state[i - 1] ^ (state[i - 1] >> 30)) + i;
171
+ }
172
+ }
173
+
174
+ /*
175
+ * This function initializes the internal state array
176
+ * with a 32-bit integer array. \b para should be one of the elements in
177
+ * the parameter table (mtgp32-param-ref.c).
178
+ *
179
+ * @param[out] mtgp32 MTGP structure.
180
+ * @param[in] para parameter structure
181
+ * @param[in] array a 32-bit integer array used as a seed.
182
+ * @param[in] length length of the array.
183
+ * @return CURAND_STATUS_SUCCESS
184
+ */
185
+ static __forceinline__ __host__
186
+ int mtgp32_init_by_array(unsigned int state[],
187
+ const mtgp32_params_fast_t *para,
188
+ unsigned int *array, int length) {
189
+ int i, j, count;
190
+ unsigned int r;
191
+ int lag;
192
+ int mid;
193
+ int size = para->mexp / 32 + 1;
194
+ unsigned int hidden_seed;
195
+ unsigned int tmp;
196
+
197
+ if (size >= 623) {
198
+ lag = 11;
199
+ } else if (size >= 68) {
200
+ lag = 7;
201
+ } else if (size >= 39) {
202
+ lag = 5;
203
+ } else {
204
+ lag = 3;
205
+ }
206
+ mid = (size - lag) / 2;
207
+
208
+ hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
209
+ tmp = hidden_seed;
210
+ tmp += tmp >> 16;
211
+ tmp += tmp >> 8;
212
+ memset(state, tmp & 0xff, sizeof(unsigned int) * size);
213
+ state[0] = hidden_seed;
214
+
215
+ if (length + 1 > size) {
216
+ count = length + 1;
217
+ } else {
218
+ count = size;
219
+ }
220
+ r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]);
221
+ state[mid] += r;
222
+ r += length;
223
+ state[(mid + lag) % size] += r;
224
+ state[0] = r;
225
+ i = 1;
226
+ count--;
227
+ for (i = 1, j = 0; (j < count) && (j < length); j++) {
228
+ r = ini_func1(state[i] ^ state[(i + mid) % size]
229
+ ^ state[(i + size - 1) % size]);
230
+ state[(i + mid) % size] += r;
231
+ r += array[j] + i;
232
+ state[(i + mid + lag) % size] += r;
233
+ state[i] = r;
234
+ i = (i + 1) % size;
235
+ }
236
+ for (; j < count; j++) {
237
+ r = ini_func1(state[i] ^ state[(i + mid) % size]
238
+ ^ state[(i + size - 1) % size]);
239
+ state[(i + mid) % size] += r;
240
+ r += i;
241
+ state[(i + mid + lag) % size] += r;
242
+ state[i] = r;
243
+ i = (i + 1) % size;
244
+ }
245
+ for (j = 0; j < size; j++) {
246
+ r = ini_func2(state[i] + state[(i + mid) % size]
247
+ + state[(i + size - 1) % size]);
248
+ state[(i + mid) % size] ^= r;
249
+ r -= i;
250
+ state[(i + mid + lag) % size] ^= r;
251
+ state[i] = r;
252
+ i = (i + 1) % size;
253
+ }
254
+ if (state[size - 1] == 0) {
255
+ state[size - 1] = non_zero;
256
+ }
257
+ return 0;
258
+ }
259
+
260
+ /*
261
+ * This function initializes the internal state array
262
+ * with a character array. \b para should be one of the elements in
263
+ * the parameter table (mtgp32-param-ref.c).
264
+ * This is the same algorithm with mtgp32_init_by_array(), but hope to
265
+ * be more useful.
266
+ *
267
+ * @param[out] mtgp32 MTGP structure.
268
+ * @param[in] para parameter structure
269
+ * @param[in] array a character array used as a seed. (terminated by zero.)
270
+ * @return memory allocation result. if 0 then O.K.
271
+ */
272
+ static __forceinline__ __host__
273
+ int mtgp32_init_by_str(unsigned int state[],
274
+ const mtgp32_params_fast_t *para, unsigned char *array) {
275
+ int i, j, count;
276
+ unsigned int r;
277
+ int lag;
278
+ int mid;
279
+ int size = para->mexp / 32 + 1;
280
+ int length = (unsigned int)strlen((char *)array);
281
+ unsigned int hidden_seed;
282
+ unsigned int tmp;
283
+
284
+ if (size >= 623) {
285
+ lag = 11;
286
+ } else if (size >= 68) {
287
+ lag = 7;
288
+ } else if (size >= 39) {
289
+ lag = 5;
290
+ } else {
291
+ lag = 3;
292
+ }
293
+ mid = (size - lag) / 2;
294
+
295
+ hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
296
+ tmp = hidden_seed;
297
+ tmp += tmp >> 16;
298
+ tmp += tmp >> 8;
299
+ memset(state, tmp & 0xff, sizeof(unsigned int) * size);
300
+ state[0] = hidden_seed;
301
+
302
+ if (length + 1 > size) {
303
+ count = length + 1;
304
+ } else {
305
+ count = size;
306
+ }
307
+ r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]);
308
+ state[mid] += r;
309
+ r += length;
310
+ state[(mid + lag) % size] += r;
311
+ state[0] = r;
312
+ i = 1;
313
+ count--;
314
+ for (i = 1, j = 0; (j < count) && (j < length); j++) {
315
+ r = ini_func1(state[i] ^ state[(i + mid) % size]
316
+ ^ state[(i + size - 1) % size]);
317
+ state[(i + mid) % size] += r;
318
+ r += array[j] + i;
319
+ state[(i + mid + lag) % size] += r;
320
+ state[i] = r;
321
+ i = (i + 1) % size;
322
+ }
323
+ for (; j < count; j++) {
324
+ r = ini_func1(state[i] ^ state[(i + mid) % size]
325
+ ^ state[(i + size - 1) % size]);
326
+ state[(i + mid) % size] += r;
327
+ r += i;
328
+ state[(i + mid + lag) % size] += r;
329
+ state[i] = r;
330
+ i = (i + 1) % size;
331
+ }
332
+ for (j = 0; j < size; j++) {
333
+ r = ini_func2(state[i] + state[(i + mid) % size]
334
+ + state[(i + size - 1) % size]);
335
+ state[(i + mid) % size] ^= r;
336
+ r -= i;
337
+ state[(i + mid + lag) % size] ^= r;
338
+ state[i] = r;
339
+ i = (i + 1) % size;
340
+ }
341
+ if (state[size - 1] == 0) {
342
+ state[size - 1] = non_zero;
343
+ }
344
+ return 0;
345
+ }
346
+
347
+ template<typename ParamsType>
348
+ static __forceinline__ __host__
349
+ curandStatus_t curandMakeMTGP32ConstantsImpl(const mtgp32_params_fast_t params[], ParamsType * p, const int block_num)
350
+ {
351
+ const int size1 = sizeof(unsigned int) * block_num;
352
+ const int size2 = sizeof(unsigned int) * block_num * TBL_SIZE;
353
+ unsigned int *h_pos_tbl;
354
+ unsigned int *h_sh1_tbl;
355
+ unsigned int *h_sh2_tbl;
356
+ unsigned int *h_param_tbl;
357
+ unsigned int *h_temper_tbl;
358
+ unsigned int *h_single_temper_tbl;
359
+ unsigned int *h_mask;
360
+ curandStatus_t status = CURAND_STATUS_SUCCESS;
361
+
362
+ h_pos_tbl = (unsigned int *)malloc(size1);
363
+ h_sh1_tbl = (unsigned int *)malloc(size1);
364
+ h_sh2_tbl = (unsigned int *)malloc(size1);
365
+ h_param_tbl = (unsigned int *)malloc(size2);
366
+ h_temper_tbl = (unsigned int *)malloc(size2);
367
+ h_single_temper_tbl = (unsigned int *)malloc(size2);
368
+ h_mask = (unsigned int *)malloc(sizeof(unsigned int));
369
+ if (h_pos_tbl == NULL
370
+ || h_sh1_tbl == NULL
371
+ || h_sh2_tbl == NULL
372
+ || h_param_tbl == NULL
373
+ || h_temper_tbl == NULL
374
+ || h_single_temper_tbl == NULL
375
+ || h_mask == NULL) {
376
+ if (h_pos_tbl != NULL) free(h_pos_tbl);
377
+ if (h_sh1_tbl != NULL) free(h_sh1_tbl);
378
+ if (h_sh2_tbl != NULL) free(h_sh2_tbl);
379
+ if (h_param_tbl != NULL) free(h_param_tbl);
380
+ if (h_temper_tbl != NULL) free(h_temper_tbl);
381
+ if (h_single_temper_tbl != NULL) free(h_single_temper_tbl);
382
+ if (h_mask != NULL) free(h_mask);
383
+ status = CURAND_STATUS_ALLOCATION_FAILED;
384
+ } else {
385
+
386
+ h_mask[0] = params[0].mask;
387
+ for (int i = 0; i < block_num; i++) {
388
+ h_pos_tbl[i] = params[i].pos;
389
+ h_sh1_tbl[i] = params[i].sh1;
390
+ h_sh2_tbl[i] = params[i].sh2;
391
+ for (int j = 0; j < TBL_SIZE; j++) {
392
+ h_param_tbl[i * TBL_SIZE + j] = params[i].tbl[j];
393
+ h_temper_tbl[i * TBL_SIZE + j] = params[i].tmp_tbl[j];
394
+ h_single_temper_tbl[i * TBL_SIZE + j] = params[i].flt_tmp_tbl[j];
395
+ }
396
+ }
397
+ if (cudaMemcpy( p->pos_tbl,
398
+ h_pos_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
399
+ {
400
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
401
+ } else
402
+ if (cudaMemcpy( p->sh1_tbl,
403
+ h_sh1_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
404
+ {
405
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
406
+ } else
407
+ if (cudaMemcpy( p->sh2_tbl,
408
+ h_sh2_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
409
+ {
410
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
411
+ } else
412
+ if (cudaMemcpy( p->param_tbl,
413
+ h_param_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
414
+ {
415
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
416
+ } else
417
+ if (cudaMemcpy( p->temper_tbl,
418
+ h_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
419
+ {
420
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
421
+ } else
422
+ if (cudaMemcpy( p->single_temper_tbl,
423
+ h_single_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
424
+ {
425
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
426
+ } else
427
+ if (cudaMemcpy( p->mask,
428
+ h_mask, sizeof(unsigned int), cudaMemcpyHostToDevice) != cudaSuccess)
429
+ {
430
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
431
+ }
432
+ }
433
+ if (h_pos_tbl != NULL) free(h_pos_tbl);
434
+ if (h_sh1_tbl != NULL) free(h_sh1_tbl);
435
+ if (h_sh2_tbl != NULL) free(h_sh2_tbl);
436
+ if (h_param_tbl != NULL) free(h_param_tbl);
437
+ if (h_temper_tbl != NULL) free(h_temper_tbl);
438
+ if (h_single_temper_tbl != NULL)free(h_single_temper_tbl);
439
+ if (h_mask != NULL) free(h_mask);
440
+ return status;
441
+ }
442
+
443
+ /**
444
+ * \brief Set up constant parameters for the mtgp32 generator
445
+ *
446
+ * This host-side helper function re-organizes CURAND_NUM_MTGP32_PARAMS sets of
447
+ * generator parameters for use by kernel functions and copies the
448
+ * result to the specified location in device memory.
449
+ *
450
+ * \param params - Pointer to an array of type mtgp32_params_fast_t in host memory
451
+ * \param p - pointer to a structure of type mtgp32_kernel_params_t in device memory.
452
+ *
453
+ * \return
454
+ * - CURAND_STATUS_ALLOCATION_FAILED if host memory could not be allocated
455
+ * - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed
456
+ * - CURAND_STATUS_SUCCESS otherwise
457
+ */
458
+ static __forceinline__ __host__
459
+ curandStatus_t curandMakeMTGP32Constants(const mtgp32_params_fast_t params[], mtgp32_kernel_params_t * p)
460
+ {
461
+ return curandMakeMTGP32ConstantsImpl(params, p, CURAND_NUM_MTGP32_PARAMS);
462
+ }
463
+
464
+ /**
465
+ * \brief Set up initial states for the mtgp32 generator
466
+ *
467
+ * This host-side helper function initializes a number of states (one parameter set per state) for
468
+ * an mtgp32 generator. To accomplish this it allocates a state array in host memory,
469
+ * initializes that array, and copies the result to device memory.
470
+ *
471
+ * \param s - pointer to an array of states in device memory
472
+ * \param params - Pointer to an array of type mtgp32_params_fast_t in host memory
473
+ * \param k - pointer to a structure of type mtgp32_kernel_params_t in device memory
474
+ * \param n - number of parameter sets/states to initialize
475
+ * \param seed - seed value
476
+ *
477
+ * \return
478
+ * - CURAND_STATUS_ALLOCATION_FAILED if host memory state could not be allocated
479
+ * - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed
480
+ * - CURAND_STATUS_SUCCESS otherwise
481
+ */
482
+ static __forceinline__ __host__
483
+ curandStatus_t CURANDAPI curandMakeMTGP32KernelState(curandStateMtgp32_t *s,
484
+ mtgp32_params_fast_t params[],
485
+ mtgp32_kernel_params_t *k,
486
+ int n,
487
+ unsigned long long seed)
488
+ {
489
+ int i;
490
+ curandStatus_t status = CURAND_STATUS_SUCCESS;
491
+ curandStateMtgp32_t *h_status =(curandStateMtgp32_t *) malloc(sizeof(curandStateMtgp32_t) * n);
492
+ if (h_status == NULL) {
493
+ status = CURAND_STATUS_ALLOCATION_FAILED;
494
+ } else {
495
+ seed = seed ^ (seed >> 32);
496
+ for (i = 0; i < n; i++) {
497
+ mtgp32_init_state(&(h_status[i].s[0]), &params[i],(unsigned int)seed + i + 1);
498
+ h_status[i].offset = 0;
499
+ h_status[i].pIdx = i;
500
+ h_status[i].k = k;
501
+ }
502
+ if (cudaMemcpy(s, h_status,
503
+ sizeof(curandStateMtgp32_t) * n,
504
+ cudaMemcpyHostToDevice) != cudaSuccess) {
505
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
506
+ }
507
+ }
508
+ free(h_status);
509
+ return status;
510
+ }
511
+
512
+ /** @} */
513
+
514
+ #endif
515
+
516
+
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * curand_mtgp32_kernel.h
52
+ *
53
+ *
54
+ * MTGP32-11213
55
+ *
56
+ * Mersenne Twister RNG for the GPU
57
+ *
58
+ * The period of generated integers is 2<sup>11213</sup>-1.
59
+ *
60
+ * This code generates 32-bit unsigned integers, and
61
+ * single precision floating point numbers uniformly distributed
62
+ * in the range [1, 2). (float r; 1.0 <= r < 2.0)
63
+ */
64
+
65
+ /*
66
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
67
+ * University. All rights reserved.
68
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
69
+ * University and University of Tokyo. All rights reserved.
70
+ *
71
+ * Redistribution and use in source and binary forms, with or without
72
+ * modification, are permitted provided that the following conditions are
73
+ * met:
74
+ *
75
+ * * Redistributions of source code must retain the above copyright
76
+ * notice, this list of conditions and the following disclaimer.
77
+ * * Redistributions in binary form must reproduce the above
78
+ * copyright notice, this list of conditions and the following
79
+ * disclaimer in the documentation and/or other materials provided
80
+ * with the distribution.
81
+ * * Neither the name of the Hiroshima University nor the names of
82
+ * its contributors may be used to endorse or promote products
83
+ * derived from this software without specific prior written
84
+ * permission.
85
+ *
86
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
87
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
88
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
89
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
90
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
91
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
92
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
93
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
94
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
95
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
96
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97
+ */
98
+ #if !defined CURAND_MTGP32_KERNEL_H
99
+ #define CURAND_MTGP32_KERNEL_H
100
+
101
+ #if !defined(QUALIFIERS)
102
+ #define QUALIFIERS static __forceinline__ __device__
103
+ #endif
104
+
105
+ #ifndef __CUDACC_RTC__
106
+ #include <cuda_runtime.h>
107
+ #include <stdlib.h>
108
+ #include <memory.h>
109
+ #include <string.h>
110
+ #endif // ifndef __CUDACC_RTC__
111
+ #include <nv/target>
112
+ #include "curand.h"
113
+ #include "curand_mtgp32.h"
114
+
115
+ /**
116
+ * \addtogroup DEVICE Device API
117
+ *
118
+ * @{
119
+ */
120
+
121
+ #ifndef __CUDA_ARCH__
122
+ // define blockDim and threadIdx for host compatibility call
123
+ extern const dim3 blockDim;
124
+ extern const uint3 threadIdx;
125
+ #endif
126
+
127
+
128
+ /*
129
+ * The function of the recursion formula calculation.
130
+ *
131
+ * @param[in] X1 the farthest part of state array.
132
+ * @param[in] X2 the second farthest part of state array.
133
+ * @param[in] Y a part of state array.
134
+ * @param[in] bid block id.
135
+ * @return output
136
+ */
137
+ QUALIFIERS unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) {
138
+ unsigned int X = (X1 & k->mask[0]) ^ X2;
139
+ unsigned int MAT;
140
+
141
+ X ^= X << k->sh1_tbl[bid];
142
+ Y = X ^ (Y >> k->sh2_tbl[bid]);
143
+ MAT = k->param_tbl[bid][Y & 0x0f];
144
+ return Y ^ MAT;
145
+ }
146
+
147
+ /*
148
+ * The tempering function.
149
+ *
150
+ * @param[in] V the output value should be tempered.
151
+ * @param[in] T the tempering helper value.
152
+ * @param[in] bid block id.
153
+ * @return the tempered value.
154
+ */
155
+ QUALIFIERS unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
156
+ unsigned int MAT;
157
+
158
+ T ^= T >> 16;
159
+ T ^= T >> 8;
160
+ MAT = k->temper_tbl[bid][T & 0x0f];
161
+ return V ^ MAT;
162
+ }
163
+
164
+ /*
165
+ * The tempering and converting function.
166
+ * By using the preset table, converting to IEEE format
167
+ * and tempering are done simultaneously.
168
+ *
169
+ * @param[in] V the output value should be tempered.
170
+ * @param[in] T the tempering helper value.
171
+ * @param[in] bid block id.
172
+ * @return the tempered and converted value.
173
+ */
174
+ QUALIFIERS unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
175
+ unsigned int MAT;
176
+ unsigned int r;
177
+
178
+ T ^= T >> 16;
179
+ T ^= T >> 8;
180
+ MAT = k->single_temper_tbl[bid][T & 0x0f];
181
+ r = (V >> 9) ^ MAT;
182
+ return r;
183
+ }
184
+
185
+ /**
186
+ * \brief Return 32-bits of pseudorandomness from a mtgp32 generator.
187
+ *
188
+ * Return 32-bits of pseudorandomness from the mtgp32 generator in \p state,
189
+ * increment position of generator by the number of threads in the block.
190
+ * Note the number of threads in the block can not exceed 256.
191
+ *
192
+ * \param state - Pointer to state to update
193
+ *
194
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
195
+ */
196
+ QUALIFIERS unsigned int curand(curandStateMtgp32_t *state)
197
+ {
198
+ unsigned int t;
199
+ unsigned int d;
200
+ int pos = state->k->pos_tbl[state->pIdx];
201
+ unsigned int r;
202
+ unsigned int o;
203
+
204
+ d = blockDim.z * blockDim.y * blockDim.x;
205
+ //assert( d <= 256 );
206
+ t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x;
207
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
208
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
209
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
210
+ state->pIdx);
211
+
212
+ state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
213
+ o = temper(state->k, r,
214
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
215
+ state->pIdx);
216
+ NV_IF_TARGET(NV_IS_DEVICE,
217
+ __syncthreads();
218
+ )
219
+ if (t == 0)
220
+ {
221
+ state->offset = (state->offset + d) & MTGP32_STATE_MASK;
222
+ }
223
+ NV_IF_TARGET(NV_IS_DEVICE,
224
+ __syncthreads();
225
+ )
226
+ return o;
227
+
228
+ }
229
+ /**
230
+ * \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator.
231
+ *
232
+ * Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state,
233
+ * increment position of generator by \p n positions, which must be the total number of positions
234
+ * upddated in the state by the thread block, for this invocation.
235
+ *
236
+ * Note :
237
+ * Thread indices must range from 0...\ n - 1.
238
+ * The number of positions updated may not exceed 256.
239
+ * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
240
+ *
241
+ * \param state - Pointer to state to update
242
+ * \param index - Index (0..255) of the position within the state to draw from and update
243
+ * \param n - The total number of postions in this state that are being updated by this invocation
244
+ *
245
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
246
+ */
247
+ QUALIFIERS unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
248
+ {
249
+ unsigned int t;
250
+ int pos = state->k->pos_tbl[state->pIdx];
251
+ unsigned int r;
252
+ unsigned int o;
253
+
254
+ t = index;
255
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
256
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
257
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
258
+ state->pIdx);
259
+
260
+ state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
261
+ o = temper(state->k, r,
262
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
263
+ state->pIdx);
264
+ NV_IF_TARGET(NV_IS_DEVICE,
265
+ __syncthreads();
266
+ )
267
+ if (index == 0)
268
+ {
269
+ state->offset = (state->offset + n) & MTGP32_STATE_MASK;
270
+ }
271
+ NV_IF_TARGET(NV_IS_DEVICE,
272
+ __syncthreads();
273
+ )
274
+ return o;
275
+ }
276
+ /**
277
+ * \brief Return a uniformly distributed float from a mtgp32 generator.
278
+ *
279
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
280
+ * from the mtgp32 generator in \p state, increment position of generator.
281
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
282
+ * point outputs are never returned.
283
+ *
284
+ * Note: This alternate derivation of a uniform float is provided for completeness
285
+ * with the original source
286
+ *
287
+ * \param state - Pointer to state to update
288
+ *
289
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
290
+ */
291
+ QUALIFIERS float curand_mtgp32_single(curandStateMtgp32_t *state)
292
+ {
293
+ unsigned int t;
294
+ unsigned int d;
295
+ int pos = state->k->pos_tbl[state->pIdx];
296
+ unsigned int r;
297
+ unsigned int o_u;
298
+ float o_f;
299
+
300
+
301
+ t = blockDim.z * blockDim.y;
302
+ d = t * blockDim.x;
303
+ //assert( d <= 256 );
304
+ t += threadIdx.x;
305
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
306
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
307
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
308
+ state->pIdx);
309
+
310
+ state->s[t] = r;
311
+ o_u = temper_single(state->k, r,
312
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
313
+ state->pIdx);
314
+ NV_IF_TARGET(NV_IS_DEVICE,
315
+ __syncthreads();
316
+ )
317
+ if (threadIdx.x == 0)
318
+ {
319
+ state->offset = (state->offset + d) & MTGP32_STATE_MASK;
320
+ }
321
+ NV_IF_TARGET(NV_IS_DEVICE,
322
+ __syncthreads();
323
+ )
324
+ memcpy(&o_f, &o_u, sizeof(o_u));
325
+ return o_f;
326
+ }
327
+
328
+ /**
329
+ * \brief Return a uniformly distributed float from a specific position in a mtgp32 generator.
330
+ *
331
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
332
+ * from position \p index of the mtgp32 generator in \p state, and
333
+ * increment position of generator by \p n positions, which must be the total number of positions
334
+ * upddated in the state by the thread block, for this invocation.
335
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
336
+ * point outputs are never returned.
337
+ *
338
+ * Note 1:
339
+ * Thread indices must range from 0...\p n - 1.
340
+ * The number of positions updated may not exceed 256.
341
+ * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
342
+ *
343
+ * Note 2: This alternate derivation of a uniform float is provided for completeness
344
+ * with the original source
345
+ *
346
+ * \param state - Pointer to state to update
347
+ * \param index - Index (0..255) of the position within the state to draw from and update
348
+ * \param n - The total number of postions in this state that are being updated by this invocation
349
+ *
350
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
351
+ */
352
+ QUALIFIERS float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
353
+ {
354
+ unsigned int t;
355
+ int pos = state->k->pos_tbl[state->pIdx];
356
+ unsigned int r;
357
+ unsigned int o_u;
358
+ float o_f;
359
+
360
+ t = index;
361
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
362
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
363
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
364
+ state->pIdx);
365
+
366
+ state->s[t] = r;
367
+ o_u = temper_single(state->k, r,
368
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
369
+ state->pIdx);
370
+ NV_IF_TARGET(NV_IS_DEVICE,
371
+ __syncthreads();
372
+ )
373
+ if (threadIdx.x == 0)
374
+ {
375
+ state->offset = (state->offset + n) & MTGP32_STATE_MASK;
376
+ }
377
+ NV_IF_TARGET(NV_IS_DEVICE,
378
+ __syncthreads();
379
+ )
380
+ memcpy(&o_f, &o_u, sizeof(o_u));
381
+ return o_f;
382
+ }
383
+
384
+ /** @} */
385
+
386
+ #endif
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32dc_p_11213.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h ADDED
@@ -0,0 +1,840 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_NORMAL_H_)
52
+ #define CURAND_NORMAL_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+ #include <nv/target>
64
+
65
+ #include "curand_mrg32k3a.h"
66
+ #include "curand_mtgp32_kernel.h"
67
+ #include "curand_philox4x32_x.h"
68
+ #include "curand_normal_static.h"
69
+
70
+ QUALIFIERS float2 _curand_box_muller(unsigned int x, unsigned int y)
71
+ {
72
+ float2 result;
73
+ float u = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2);
74
+ float v = y * CURAND_2POW32_INV_2PI + (CURAND_2POW32_INV_2PI/2);
75
+ float s;
76
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
77
+ s = sqrtf(-2.0f * logf(u));
78
+ __sincosf(v, &result.x, &result.y);
79
+ ,
80
+ s = sqrtf(-2.0f * logf(u));
81
+ result.x = sinf(v);
82
+ result.y = cosf(v);
83
+ )
84
+ result.x *= s;
85
+ result.y *= s;
86
+ return result;
87
+ }
88
+
89
+ QUALIFIERS float2 curand_box_muller_mrg(curandStateMRG32k3a_t * state)
90
+ {
91
+ float x, y;
92
+ x = curand_uniform(state);
93
+ y = curand_uniform(state) * CURAND_2PI;
94
+ float2 result;
95
+ float s;
96
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
97
+ s = sqrtf(-2.0f * logf(x));
98
+ __sincosf(y, &result.x, &result.y);
99
+ ,
100
+ s = sqrtf(-2.0f * logf(x));
101
+ result.x = sinf(y);
102
+ result.y = cosf(y);
103
+ )
104
+ result.x *= s;
105
+ result.y *= s;
106
+ return result;
107
+ }
108
+
109
+ QUALIFIERS double2
110
+ _curand_box_muller_double(unsigned int x0, unsigned int x1,
111
+ unsigned int y0, unsigned int y1)
112
+ {
113
+ double2 result;
114
+ unsigned long long zx = (unsigned long long)x0 ^
115
+ ((unsigned long long)x1 << (53 - 32));
116
+ double u = zx * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
117
+ unsigned long long zy = (unsigned long long)y0 ^
118
+ ((unsigned long long)y1 << (53 - 32));
119
+ double v = zy * (CURAND_2POW53_INV_DOUBLE*2.0) + CURAND_2POW53_INV_DOUBLE;
120
+ double s = sqrt(-2.0 * log(u));
121
+
122
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
123
+ sincospi(v, &result.x, &result.y);
124
+ ,
125
+ result.x = sin(v*CURAND_PI_DOUBLE);
126
+ result.y = cos(v*CURAND_PI_DOUBLE);
127
+ )
128
+ result.x *= s;
129
+ result.y *= s;
130
+
131
+ return result;
132
+ }
133
+
134
+ QUALIFIERS double2
135
+ curand_box_muller_mrg_double(curandStateMRG32k3a_t * state)
136
+ {
137
+ double x, y;
138
+ double2 result;
139
+ x = curand_uniform_double(state);
140
+ y = curand_uniform_double(state) * 2.0;
141
+
142
+ double s = sqrt(-2.0 * log(x));
143
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
144
+ sincospi(y, &result.x, &result.y);
145
+ ,
146
+ result.x = sin(y*CURAND_PI_DOUBLE);
147
+ result.y = cos(y*CURAND_PI_DOUBLE);
148
+ )
149
+ result.x *= s;
150
+ result.y *= s;
151
+ return result;
152
+ }
153
+
154
+ template <typename R>
155
+ QUALIFIERS float2 curand_box_muller(R *state)
156
+ {
157
+ float2 result;
158
+ unsigned int x = curand(state);
159
+ unsigned int y = curand(state);
160
+ result = _curand_box_muller(x, y);
161
+ return result;
162
+ }
163
+
164
+ template <typename R>
165
+ QUALIFIERS float4 curand_box_muller4(R *state)
166
+ {
167
+ float4 result;
168
+ float2 _result;
169
+ uint4 x = curand4(state);
170
+ //unsigned int y = curand(state);
171
+ _result = _curand_box_muller(x.x, x.y);
172
+ result.x = _result.x;
173
+ result.y = _result.y;
174
+ _result = _curand_box_muller(x.z, x.w);
175
+ result.z = _result.x;
176
+ result.w = _result.y;
177
+ return result;
178
+ }
179
+
180
+ template <typename R>
181
+ QUALIFIERS double2 curand_box_muller_double(R *state)
182
+ {
183
+ double2 result;
184
+ unsigned int x0 = curand(state);
185
+ unsigned int x1 = curand(state);
186
+ unsigned int y0 = curand(state);
187
+ unsigned int y1 = curand(state);
188
+ result = _curand_box_muller_double(x0, x1, y0, y1);
189
+ return result;
190
+ }
191
+
192
+ template <typename R>
193
+ QUALIFIERS double2 curand_box_muller2_double(R *state)
194
+ {
195
+ double2 result;
196
+ uint4 _x;
197
+ _x = curand4(state);
198
+ result = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
199
+ return result;
200
+ }
201
+
202
+
203
+ template <typename R>
204
+ QUALIFIERS double4 curand_box_muller4_double(R *state)
205
+ {
206
+ double4 result;
207
+ double2 _res1;
208
+ double2 _res2;
209
+ uint4 _x;
210
+ uint4 _y;
211
+ _x = curand4(state);
212
+ _y = curand4(state);
213
+ _res1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
214
+ _res2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
215
+ result.x = _res1.x;
216
+ result.y = _res1.y;
217
+ result.z = _res2.x;
218
+ result.w = _res2.y;
219
+ return result;
220
+ }
221
+
222
+ //QUALIFIERS float _curand_normal_icdf(unsigned int x)
223
+ //{
224
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
225
+ // float s = CURAND_SQRT2;
226
+ // // Mirror to avoid loss of precision
227
+ // if(x > 0x80000000UL) {
228
+ // x = 0xffffffffUL - x;
229
+ // s = -s;
230
+ // }
231
+ // float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
232
+ // // p is in (0, 0.5], 2p is in (0, 1]
233
+ // return s * erfcinvf(2.0f * p);
234
+ //#else
235
+ // x++; //suppress warnings
236
+ // return 0.0f;
237
+ //#endif
238
+ //}
239
+ //
240
+ //QUALIFIERS float _curand_normal_icdf(unsigned long long x)
241
+ //{
242
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
243
+ // unsigned int t = (unsigned int)(x >> 32);
244
+ // float s = CURAND_SQRT2;
245
+ // // Mirror to avoid loss of precision
246
+ // if(t > 0x80000000UL) {
247
+ // t = 0xffffffffUL - t;
248
+ // s = -s;
249
+ // }
250
+ // float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
251
+ // // p is in (0, 0.5], 2p is in (0, 1]
252
+ // return s * erfcinvf(2.0f * p);
253
+ //#else
254
+ // x++;
255
+ // return 0.0f;
256
+ //#endif
257
+ //}
258
+ //
259
+ //QUALIFIERS double _curand_normal_icdf_double(unsigned int x)
260
+ //{
261
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
262
+ // double s = CURAND_SQRT2_DOUBLE;
263
+ // // Mirror to avoid loss of precision
264
+ // if(x > 0x80000000UL) {
265
+ // x = 0xffffffffUL - x;
266
+ // s = -s;
267
+ // }
268
+ // double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
269
+ // // p is in (0, 0.5], 2p is in (0, 1]
270
+ // return s * erfcinv(2.0 * p);
271
+ //#else
272
+ // x++;
273
+ // return 0.0;
274
+ //#endif
275
+ //}
276
+ //
277
+ //QUALIFIERS double _curand_normal_icdf_double(unsigned long long x)
278
+ //{
279
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
280
+ // double s = CURAND_SQRT2_DOUBLE;
281
+ // x >>= 11;
282
+ // // Mirror to avoid loss of precision
283
+ // if(x > 0x10000000000000UL) {
284
+ // x = 0x1fffffffffffffUL - x;
285
+ // s = -s;
286
+ // }
287
+ // double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
288
+ // // p is in (0, 0.5], 2p is in (0, 1]
289
+ // return s * erfcinv(2.0 * p);
290
+ //#else
291
+ // x++;
292
+ // return 0.0;
293
+ //#endif
294
+ //}
295
+ //
296
+
297
+ /**
298
+ * \brief Return a normally distributed float from an XORWOW generator.
299
+ *
300
+ * Return a single normally distributed float with mean \p 0.0f and
301
+ * standard deviation \p 1.0f from the XORWOW generator in \p state,
302
+ * increment position of generator by one.
303
+ *
304
+ * The implementation uses a Box-Muller transform to generate two
305
+ * normally distributed results, then returns them one at a time.
306
+ * See ::curand_normal2() for a more efficient version that returns
307
+ * both results at once.
308
+ *
309
+ * \param state - Pointer to state to update
310
+ *
311
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
312
+ */
313
+ QUALIFIERS float curand_normal(curandStateXORWOW_t *state)
314
+ {
315
+ if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
316
+ unsigned int x, y;
317
+ x = curand(state);
318
+ y = curand(state);
319
+ float2 v = _curand_box_muller(x, y);
320
+ state->boxmuller_extra = v.y;
321
+ state->boxmuller_flag = EXTRA_FLAG_NORMAL;
322
+ return v.x;
323
+ }
324
+ state->boxmuller_flag = 0;
325
+ return state->boxmuller_extra;
326
+ }
327
+
328
+ /**
329
+ * \brief Return a normally distributed float from an Philox4_32_10 generator.
330
+ *
331
+ * Return a single normally distributed float with mean \p 0.0f and
332
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
333
+ * increment position of generator by one.
334
+ *
335
+ * The implementation uses a Box-Muller transform to generate two
336
+ * normally distributed results, then returns them one at a time.
337
+ * See ::curand_normal2() for a more efficient version that returns
338
+ * both results at once.
339
+ *
340
+ * \param state - Pointer to state to update
341
+ *
342
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
343
+ */
344
+
345
+ QUALIFIERS float curand_normal(curandStatePhilox4_32_10_t *state)
346
+ {
347
+ if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
348
+ unsigned int x, y;
349
+ x = curand(state);
350
+ y = curand(state);
351
+ float2 v = _curand_box_muller(x, y);
352
+ state->boxmuller_extra = v.y;
353
+ state->boxmuller_flag = EXTRA_FLAG_NORMAL;
354
+ return v.x;
355
+ }
356
+ state->boxmuller_flag = 0;
357
+ return state->boxmuller_extra;
358
+ }
359
+
360
+
361
+
362
+ /**
363
+ * \brief Return a normally distributed float from an MRG32k3a generator.
364
+ *
365
+ * Return a single normally distributed float with mean \p 0.0f and
366
+ * standard deviation \p 1.0f from the MRG32k3a generator in \p state,
367
+ * increment position of generator by one.
368
+ *
369
+ * The implementation uses a Box-Muller transform to generate two
370
+ * normally distributed results, then returns them one at a time.
371
+ * See ::curand_normal2() for a more efficient version that returns
372
+ * both results at once.
373
+ *
374
+ * \param state - Pointer to state to update
375
+ *
376
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
377
+ */
378
+ QUALIFIERS float curand_normal(curandStateMRG32k3a_t *state)
379
+ {
380
+ if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
381
+ float2 v = curand_box_muller_mrg(state);
382
+ state->boxmuller_extra = v.y;
383
+ state->boxmuller_flag = EXTRA_FLAG_NORMAL;
384
+ return v.x;
385
+ }
386
+ state->boxmuller_flag = 0;
387
+ return state->boxmuller_extra;
388
+ }
389
+
390
+ /**
391
+ * \brief Return two normally distributed floats from an XORWOW generator.
392
+ *
393
+ * Return two normally distributed floats with mean \p 0.0f and
394
+ * standard deviation \p 1.0f from the XORWOW generator in \p state,
395
+ * increment position of generator by two.
396
+ *
397
+ * The implementation uses a Box-Muller transform to generate two
398
+ * normally distributed results.
399
+ *
400
+ * \param state - Pointer to state to update
401
+ *
402
+ * \return Normally distributed float2 where each element is from a
403
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
404
+ */
405
+ QUALIFIERS float2 curand_normal2(curandStateXORWOW_t *state)
406
+ {
407
+ return curand_box_muller(state);
408
+ }
409
+ /**
410
+ * \brief Return two normally distributed floats from an Philox4_32_10 generator.
411
+ *
412
+ * Return two normally distributed floats with mean \p 0.0f and
413
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
414
+ * increment position of generator by two.
415
+ *
416
+ * The implementation uses a Box-Muller transform to generate two
417
+ * normally distributed results.
418
+ *
419
+ * \param state - Pointer to state to update
420
+ *
421
+ * \return Normally distributed float2 where each element is from a
422
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
423
+ */
424
+ QUALIFIERS float2 curand_normal2(curandStatePhilox4_32_10_t *state)
425
+ {
426
+ return curand_box_muller(state);
427
+ }
428
+
429
+ /**
430
+ * \brief Return four normally distributed floats from an Philox4_32_10 generator.
431
+ *
432
+ * Return four normally distributed floats with mean \p 0.0f and
433
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
434
+ * increment position of generator by four.
435
+ *
436
+ * The implementation uses a Box-Muller transform to generate two
437
+ * normally distributed results.
438
+ *
439
+ * \param state - Pointer to state to update
440
+ *
441
+ * \return Normally distributed float2 where each element is from a
442
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
443
+ */
444
+ QUALIFIERS float4 curand_normal4(curandStatePhilox4_32_10_t *state)
445
+ {
446
+ return curand_box_muller4(state);
447
+ }
448
+
449
+
450
+
451
+ /**
452
+ * \brief Return two normally distributed floats from an MRG32k3a generator.
453
+ *
454
+ * Return two normally distributed floats with mean \p 0.0f and
455
+ * standard deviation \p 1.0f from the MRG32k3a generator in \p state,
456
+ * increment position of generator by two.
457
+ *
458
+ * The implementation uses a Box-Muller transform to generate two
459
+ * normally distributed results.
460
+ *
461
+ * \param state - Pointer to state to update
462
+ *
463
+ * \return Normally distributed float2 where each element is from a
464
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
465
+ */
466
+ QUALIFIERS float2 curand_normal2(curandStateMRG32k3a_t *state)
467
+ {
468
+ return curand_box_muller_mrg(state);
469
+ }
470
+
471
+ /**
472
+ * \brief Return a normally distributed float from a MTGP32 generator.
473
+ *
474
+ * Return a single normally distributed float with mean \p 0.0f and
475
+ * standard deviation \p 1.0f from the MTGP32 generator in \p state,
476
+ * increment position of generator.
477
+ *
478
+ * The implementation uses the inverse cumulative distribution function
479
+ * to generate normally distributed results.
480
+ *
481
+ * \param state - Pointer to state to update
482
+ *
483
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
484
+ */
485
+ QUALIFIERS float curand_normal(curandStateMtgp32_t *state)
486
+ {
487
+ return _curand_normal_icdf(curand(state));
488
+ }
489
+ /**
490
+ * \brief Return a normally distributed float from a Sobol32 generator.
491
+ *
492
+ * Return a single normally distributed float with mean \p 0.0f and
493
+ * standard deviation \p 1.0f from the Sobol32 generator in \p state,
494
+ * increment position of generator by one.
495
+ *
496
+ * The implementation uses the inverse cumulative distribution function
497
+ * to generate normally distributed results.
498
+ *
499
+ * \param state - Pointer to state to update
500
+ *
501
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
502
+ */
503
+ QUALIFIERS float curand_normal(curandStateSobol32_t *state)
504
+ {
505
+ return _curand_normal_icdf(curand(state));
506
+ }
507
+
508
+ /**
509
+ * \brief Return a normally distributed float from a scrambled Sobol32 generator.
510
+ *
511
+ * Return a single normally distributed float with mean \p 0.0f and
512
+ * standard deviation \p 1.0f from the scrambled Sobol32 generator in \p state,
513
+ * increment position of generator by one.
514
+ *
515
+ * The implementation uses the inverse cumulative distribution function
516
+ * to generate normally distributed results.
517
+ *
518
+ * \param state - Pointer to state to update
519
+ *
520
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
521
+ */
522
+ QUALIFIERS float curand_normal(curandStateScrambledSobol32_t *state)
523
+ {
524
+ return _curand_normal_icdf(curand(state));
525
+ }
526
+
527
+ /**
528
+ * \brief Return a normally distributed float from a Sobol64 generator.
529
+ *
530
+ * Return a single normally distributed float with mean \p 0.0f and
531
+ * standard deviation \p 1.0f from the Sobol64 generator in \p state,
532
+ * increment position of generator by one.
533
+ *
534
+ * The implementation uses the inverse cumulative distribution function
535
+ * to generate normally distributed results.
536
+ *
537
+ * \param state - Pointer to state to update
538
+ *
539
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
540
+ */
541
+ QUALIFIERS float curand_normal(curandStateSobol64_t *state)
542
+ {
543
+ return _curand_normal_icdf(curand(state));
544
+ }
545
+
546
+ /**
547
+ * \brief Return a normally distributed float from a scrambled Sobol64 generator.
548
+ *
549
+ * Return a single normally distributed float with mean \p 0.0f and
550
+ * standard deviation \p 1.0f from the scrambled Sobol64 generator in \p state,
551
+ * increment position of generator by one.
552
+ *
553
+ * The implementation uses the inverse cumulative distribution function
554
+ * to generate normally distributed results.
555
+ *
556
+ * \param state - Pointer to state to update
557
+ *
558
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
559
+ */
560
+ QUALIFIERS float curand_normal(curandStateScrambledSobol64_t *state)
561
+ {
562
+ return _curand_normal_icdf(curand(state));
563
+ }
564
+
565
+ /**
566
+ * \brief Return a normally distributed double from an XORWOW generator.
567
+ *
568
+ * Return a single normally distributed double with mean \p 0.0 and
569
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
570
+ * increment position of generator.
571
+ *
572
+ * The implementation uses a Box-Muller transform to generate two
573
+ * normally distributed results, then returns them one at a time.
574
+ * See ::curand_normal2_double() for a more efficient version that returns
575
+ * both results at once.
576
+ *
577
+ * \param state - Pointer to state to update
578
+ *
579
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
580
+ */
581
+ QUALIFIERS double curand_normal_double(curandStateXORWOW_t *state)
582
+ {
583
+ if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
584
+ unsigned int x0, x1, y0, y1;
585
+ x0 = curand(state);
586
+ x1 = curand(state);
587
+ y0 = curand(state);
588
+ y1 = curand(state);
589
+ double2 v = _curand_box_muller_double(x0, x1, y0, y1);
590
+ state->boxmuller_extra_double = v.y;
591
+ state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
592
+ return v.x;
593
+ }
594
+ state->boxmuller_flag_double = 0;
595
+ return state->boxmuller_extra_double;
596
+ }
597
+
598
+ /**
599
+ * \brief Return a normally distributed double from an Philox4_32_10 generator.
600
+ *
601
+ * Return a single normally distributed double with mean \p 0.0 and
602
+ * standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
603
+ * increment position of generator.
604
+ *
605
+ * The implementation uses a Box-Muller transform to generate two
606
+ * normally distributed results, then returns them one at a time.
607
+ * See ::curand_normal2_double() for a more efficient version that returns
608
+ * both results at once.
609
+ *
610
+ * \param state - Pointer to state to update
611
+ *
612
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
613
+ */
614
+
615
+ QUALIFIERS double curand_normal_double(curandStatePhilox4_32_10_t *state)
616
+ {
617
+ if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
618
+ uint4 _x;
619
+ _x = curand4(state);
620
+ double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
621
+ state->boxmuller_extra_double = v.y;
622
+ state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
623
+ return v.x;
624
+ }
625
+ state->boxmuller_flag_double = 0;
626
+ return state->boxmuller_extra_double;
627
+ }
628
+
629
+
630
+ /**
631
+ * \brief Return a normally distributed double from an MRG32k3a generator.
632
+ *
633
+ * Return a single normally distributed double with mean \p 0.0 and
634
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
635
+ * increment position of generator.
636
+ *
637
+ * The implementation uses a Box-Muller transform to generate two
638
+ * normally distributed results, then returns them one at a time.
639
+ * See ::curand_normal2_double() for a more efficient version that returns
640
+ * both results at once.
641
+ *
642
+ * \param state - Pointer to state to update
643
+ *
644
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
645
+ */
646
+ QUALIFIERS double curand_normal_double(curandStateMRG32k3a_t *state)
647
+ {
648
+ if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
649
+ double2 v = curand_box_muller_mrg_double(state);
650
+ state->boxmuller_extra_double = v.y;
651
+ state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
652
+ return v.x;
653
+ }
654
+ state->boxmuller_flag_double = 0;
655
+ return state->boxmuller_extra_double;
656
+ }
657
+
658
+ /**
659
+ * \brief Return two normally distributed doubles from an XORWOW generator.
660
+ *
661
+ * Return two normally distributed doubles with mean \p 0.0 and
662
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
663
+ * increment position of generator by 2.
664
+ *
665
+ * The implementation uses a Box-Muller transform to generate two
666
+ * normally distributed results.
667
+ *
668
+ * \param state - Pointer to state to update
669
+ *
670
+ * \return Normally distributed double2 where each element is from a
671
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
672
+ */
673
+ QUALIFIERS double2 curand_normal2_double(curandStateXORWOW_t *state)
674
+ {
675
+ return curand_box_muller_double(state);
676
+ }
677
+
678
+ /**
679
+ * \brief Return two normally distributed doubles from an Philox4_32_10 generator.
680
+ *
681
+ * Return two normally distributed doubles with mean \p 0.0 and
682
+ * standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
683
+ * increment position of generator by 2.
684
+ *
685
+ * The implementation uses a Box-Muller transform to generate two
686
+ * normally distributed results.
687
+ *
688
+ * \param state - Pointer to state to update
689
+ *
690
+ * \return Normally distributed double2 where each element is from a
691
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
692
+ */
693
+ QUALIFIERS double2 curand_normal2_double(curandStatePhilox4_32_10_t *state)
694
+ {
695
+ uint4 _x;
696
+ double2 result;
697
+
698
+ _x = curand4(state);
699
+ double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
700
+ result.x = v1.x;
701
+ result.y = v1.y;
702
+
703
+ return result;
704
+ }
705
+
706
+ // not a part of API
707
+ QUALIFIERS double4 curand_normal4_double(curandStatePhilox4_32_10_t *state)
708
+ {
709
+ uint4 _x;
710
+ uint4 _y;
711
+ double4 result;
712
+
713
+ _x = curand4(state);
714
+ _y = curand4(state);
715
+ double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
716
+ double2 v2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
717
+ result.x = v1.x;
718
+ result.y = v1.y;
719
+ result.z = v2.x;
720
+ result.w = v2.y;
721
+
722
+ return result;
723
+ }
724
+
725
+
726
+ /**
727
+ * \brief Return two normally distributed doubles from an MRG32k3a generator.
728
+ *
729
+ * Return two normally distributed doubles with mean \p 0.0 and
730
+ * standard deviation \p 1.0 from the MRG32k3a generator in \p state,
731
+ * increment position of generator.
732
+ *
733
+ * The implementation uses a Box-Muller transform to generate two
734
+ * normally distributed results.
735
+ *
736
+ * \param state - Pointer to state to update
737
+ *
738
+ * \return Normally distributed double2 where each element is from a
739
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
740
+ */
741
+ QUALIFIERS double2 curand_normal2_double(curandStateMRG32k3a_t *state)
742
+ {
743
+ return curand_box_muller_mrg_double(state);
744
+ }
745
+
746
+ /**
747
+ * \brief Return a normally distributed double from an MTGP32 generator.
748
+ *
749
+ * Return a single normally distributed double with mean \p 0.0 and
750
+ * standard deviation \p 1.0 from the MTGP32 generator in \p state,
751
+ * increment position of generator.
752
+ *
753
+ * The implementation uses the inverse cumulative distribution function
754
+ * to generate normally distributed results.
755
+ *
756
+ * \param state - Pointer to state to update
757
+ *
758
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
759
+ */
760
+ QUALIFIERS double curand_normal_double(curandStateMtgp32_t *state)
761
+ {
762
+ return _curand_normal_icdf_double(curand(state));
763
+ }
764
+
765
+ /**
766
+ * \brief Return a normally distributed double from an Sobol32 generator.
767
+ *
768
+ * Return a single normally distributed double with mean \p 0.0 and
769
+ * standard deviation \p 1.0 from the Sobol32 generator in \p state,
770
+ * increment position of generator by one.
771
+ *
772
+ * The implementation uses the inverse cumulative distribution function
773
+ * to generate normally distributed results.
774
+ *
775
+ * \param state - Pointer to state to update
776
+ *
777
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
778
+ */
779
+ QUALIFIERS double curand_normal_double(curandStateSobol32_t *state)
780
+ {
781
+ return _curand_normal_icdf_double(curand(state));
782
+ }
783
+
784
+ /**
785
+ * \brief Return a normally distributed double from a scrambled Sobol32 generator.
786
+ *
787
+ * Return a single normally distributed double with mean \p 0.0 and
788
+ * standard deviation \p 1.0 from the scrambled Sobol32 generator in \p state,
789
+ * increment position of generator by one.
790
+ *
791
+ * The implementation uses the inverse cumulative distribution function
792
+ * to generate normally distributed results.
793
+ *
794
+ * \param state - Pointer to state to update
795
+ *
796
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
797
+ */
798
+ QUALIFIERS double curand_normal_double(curandStateScrambledSobol32_t *state)
799
+ {
800
+ return _curand_normal_icdf_double(curand(state));
801
+ }
802
+
803
+ /**
804
+ * \brief Return a normally distributed double from a Sobol64 generator.
805
+ *
806
+ * Return a single normally distributed double with mean \p 0.0 and
807
+ * standard deviation \p 1.0 from the Sobol64 generator in \p state,
808
+ * increment position of generator by one.
809
+ *
810
+ * The implementation uses the inverse cumulative distribution function
811
+ * to generate normally distributed results.
812
+ *
813
+ * \param state - Pointer to state to update
814
+ *
815
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
816
+ */
817
+ QUALIFIERS double curand_normal_double(curandStateSobol64_t *state)
818
+ {
819
+ return _curand_normal_icdf_double(curand(state));
820
+ }
821
+
822
+ /**
823
+ * \brief Return a normally distributed double from a scrambled Sobol64 generator.
824
+ *
825
+ * Return a single normally distributed double with mean \p 0.0 and
826
+ * standard deviation \p 1.0 from the scrambled Sobol64 generator in \p state,
827
+ * increment position of generator by one.
828
+ *
829
+ * The implementation uses the inverse cumulative distribution function
830
+ * to generate normally distributed results.
831
+ *
832
+ * \param state - Pointer to state to update
833
+ *
834
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
835
+ */
836
+ QUALIFIERS double curand_normal_double(curandStateScrambledSobol64_t *state)
837
+ {
838
+ return _curand_normal_icdf_double(curand(state));
839
+ }
840
+ #endif // !defined(CURAND_NORMAL_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal_static.h ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+ #ifndef CURAND_NORMAL_STATIC_H
49
+ #define CURAND_NORMAL_STATIC_H
50
+
51
+ #define QUALIFIERS_STATIC __host__ __device__ __forceinline__
52
+
53
+ #include <nv/target>
54
+ #if defined(HOST_HAVE_ERFCINVF)
55
+ #define IF_DEVICE_OR_HAVE_ERFCINVF(t, f) _NV_BLOCK_EXPAND(t)
56
+ #else
57
+ #define IF_DEVICE_OR_HAVE_ERFCINVF(t, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, t, f)
58
+ #endif
59
+
60
+ QUALIFIERS_STATIC float _curand_normal_icdf(unsigned int x)
61
+ {
62
+ IF_DEVICE_OR_HAVE_ERFCINVF(
63
+ float s = CURAND_SQRT2;
64
+ // Mirror to avoid loss of precision
65
+ if(x > 0x80000000UL) {
66
+ x = 0xffffffffUL - x;
67
+ s = -s;
68
+ }
69
+ float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
70
+ // p is in (0, 0.5], 2p is in (0, 1]
71
+ return s * erfcinvf(2.0f * p);
72
+ ,
73
+ x++; //suppress warnings
74
+ return 0.0f;
75
+ )
76
+ }
77
+
78
+ QUALIFIERS_STATIC float _curand_normal_icdf(unsigned long long x)
79
+ {
80
+ IF_DEVICE_OR_HAVE_ERFCINVF(
81
+ unsigned int t = (unsigned int)(x >> 32);
82
+ float s = CURAND_SQRT2;
83
+ // Mirror to avoid loss of precision
84
+ if(t > 0x80000000UL) {
85
+ t = 0xffffffffUL - t;
86
+ s = -s;
87
+ }
88
+ float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
89
+ // p is in (0 - 0.5] 2p is in (0 - 1]
90
+ return s * erfcinvf(2.0f * p);
91
+ ,
92
+ x++;
93
+ return 0.0f;
94
+ )
95
+ }
96
+
97
+ QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned int x)
98
+ {
99
+ IF_DEVICE_OR_HAVE_ERFCINVF(
100
+ double s = CURAND_SQRT2_DOUBLE;
101
+ // Mirror to avoid loss of precision
102
+ if(x > 0x80000000UL) {
103
+ x = 0xffffffffUL - x;
104
+ s = -s;
105
+ }
106
+ double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
107
+ // p is in (0 - 0.5] 2p is in (0 - 1]
108
+ return s * erfcinv(2.0 * p);
109
+ ,
110
+ x++;
111
+ return 0.0;
112
+ )
113
+ }
114
+
115
+ QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned long long x)
116
+ {
117
+ IF_DEVICE_OR_HAVE_ERFCINVF(
118
+ double s = CURAND_SQRT2_DOUBLE;
119
+ x >>= 11;
120
+ // Mirror to avoid loss of precision
121
+ if(x > 0x10000000000000UL) {
122
+ x = 0x1fffffffffffffUL - x;
123
+ s = -s;
124
+ }
125
+ double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
126
+ // p is in (0 - 0.5] 2p is in (0 - 1]
127
+ return s * erfcinv(2.0 * p);
128
+ ,
129
+ x++;
130
+ return 0.0;
131
+ )
132
+ }
133
+ #undef QUALIFIERS_STATIC
134
+ #endif
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_philox4x32_x.h ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+ /*
49
+ Copyright 2010-2011, D. E. Shaw Research.
50
+ All rights reserved.
51
+
52
+ Redistribution and use in source and binary forms, with or without
53
+ modification, are permitted provided that the following conditions are
54
+ met:
55
+
56
+ * Redistributions of source code must retain the above copyright
57
+ notice, this list of conditions, and the following disclaimer.
58
+
59
+ * Redistributions in binary form must reproduce the above copyright
60
+ notice, this list of conditions, and the following disclaimer in the
61
+ documentation and/or other materials provided with the distribution.
62
+
63
+ * Neither the name of D. E. Shaw Research nor the names of its
64
+ contributors may be used to endorse or promote products derived from
65
+ this software without specific prior written permission.
66
+
67
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
68
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
69
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
70
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
71
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
72
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
73
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
74
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
75
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
76
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
77
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
78
+ */
79
+
80
+ #ifndef CURAND_PHILOX4X32_X__H_
81
+ #define CURAND_PHILOX4X32_X__H_
82
+ #include <nv/target>
83
+
84
+ #if !defined(QUALIFIERS)
85
+ #define QUALIFIERS static __forceinline__ __device__
86
+ #endif
87
+
88
+ #define PHILOX_W32_0 (0x9E3779B9)
89
+ #define PHILOX_W32_1 (0xBB67AE85)
90
+ #define PHILOX_M4x32_0 (0xD2511F53)
91
+ #define PHILOX_M4x32_1 (0xCD9E8D57)
92
+
93
+ struct curandStatePhilox4_32_10 {
94
+ uint4 ctr;
95
+ uint4 output;
96
+ uint2 key;
97
+ unsigned int STATE;
98
+ int boxmuller_flag;
99
+ int boxmuller_flag_double;
100
+ float boxmuller_extra;
101
+ double boxmuller_extra_double;
102
+ };
103
+
104
+ typedef struct curandStatePhilox4_32_10 curandStatePhilox4_32_10_t;
105
+
106
+
107
+ QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s, unsigned long long n)
108
+ {
109
+ unsigned int nlo = (unsigned int)(n);
110
+ unsigned int nhi = (unsigned int)(n>>32);
111
+
112
+ s->ctr.x += nlo;
113
+ if( s->ctr.x < nlo )
114
+ nhi++;
115
+
116
+ s->ctr.y += nhi;
117
+ if(nhi <= s->ctr.y)
118
+ return;
119
+ if(++s->ctr.z) return;
120
+ ++s->ctr.w;
121
+ }
122
+
123
+ QUALIFIERS void Philox_State_Incr_hi(curandStatePhilox4_32_10_t* s, unsigned long long n)
124
+ {
125
+ unsigned int nlo = (unsigned int)(n);
126
+ unsigned int nhi = (unsigned int)(n>>32);
127
+
128
+ s->ctr.z += nlo;
129
+ if( s->ctr.z < nlo )
130
+ nhi++;
131
+
132
+ s->ctr.w += nhi;
133
+ }
134
+
135
+
136
+
137
+ QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s)
138
+ {
139
+ if(++s->ctr.x) return;
140
+ if(++s->ctr.y) return;
141
+ if(++s->ctr.z) return;
142
+ ++s->ctr.w;
143
+ }
144
+
145
+
146
+ QUALIFIERS unsigned int mulhilo32(unsigned int a, unsigned int b, unsigned int* hip)
147
+ {
148
+ NV_IF_ELSE_TARGET(NV_IS_HOST,
149
+ // host code
150
+ unsigned long long product = ((unsigned long long)a) * ((unsigned long long)b);
151
+ *hip = product >> 32;
152
+ return (unsigned int)product;
153
+ ,
154
+ // device code
155
+ *hip = __umulhi(a,b);
156
+ return a*b;
157
+ )
158
+ }
159
+
160
+ QUALIFIERS uint4 _philox4x32round(uint4 ctr, uint2 key)
161
+ {
162
+ unsigned int hi0;
163
+ unsigned int hi1;
164
+ unsigned int lo0 = mulhilo32(PHILOX_M4x32_0, ctr.x, &hi0);
165
+ unsigned int lo1 = mulhilo32(PHILOX_M4x32_1, ctr.z, &hi1);
166
+
167
+ uint4 ret = {hi1^ctr.y^key.x, lo1, hi0^ctr.w^key.y, lo0};
168
+ return ret;
169
+ }
170
+
171
+ QUALIFIERS uint4 curand_Philox4x32_10( uint4 c, uint2 k)
172
+ {
173
+ c = _philox4x32round(c, k); // 1
174
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
175
+ c = _philox4x32round(c, k); // 2
176
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
177
+ c = _philox4x32round(c, k); // 3
178
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
179
+ c = _philox4x32round(c, k); // 4
180
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
181
+ c = _philox4x32round(c, k); // 5
182
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
183
+ c = _philox4x32round(c, k); // 6
184
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
185
+ c = _philox4x32round(c, k); // 7
186
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
187
+ c = _philox4x32round(c, k); // 8
188
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
189
+ c = _philox4x32round(c, k); // 9
190
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
191
+ return _philox4x32round(c, k); // 10
192
+ }
193
+
194
+
195
+ #endif
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_poisson.h ADDED
@@ -0,0 +1,763 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_POISSON_H_)
52
+ #define CURAND_POISSON_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include <nv/target>
65
+
66
+ #include "curand_mrg32k3a.h"
67
+ #include "curand_mtgp32_kernel.h"
68
+ #include "curand_philox4x32_x.h"
69
+
70
+ #define CR_CUDART_PI 3.1415926535897931e+0
71
+ #define CR_CUDART_TWO_TO_52 4503599627370496.0
72
+
73
+
74
QUALIFIERS float __cr_rsqrt(float a)
{
    // Reciprocal square root: fast PTX approximation (flush-to-zero) in
    // device code, 1/sqrtf fallback on the host.
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        asm ("rsqrt.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
    ,
        a = 1.0f / sqrtf (a);
    )
    return a;
}
83
+
84
QUALIFIERS float __cr_exp (float a)
{
    // exp(a) via the device ex2 (2^x) approximation: exp(a) = 2^(a*log2(e)).
    // Host build uses expf directly.
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        a = a * 1.4426950408889634074;   // log2(e)
        asm ("ex2.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
    ,
        a = expf (a);
    )
    return a;
}
94
+
95
QUALIFIERS float __cr_log (float a)
{
    // ln(a) via the device lg2 (log2) approximation: ln(a) = log2(a)*ln(2).
    // Host build uses logf directly.
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        asm ("lg2.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
        a = a * 0.69314718055994530942;  // ln(2)
    ,
        a = logf (a);
    )
    return a;
}
105
+
106
QUALIFIERS float __cr_rcp (float a)
{
    // Fast reciprocal: PTX approximation on device, plain division on host.
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        asm ("rcp.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
    ,
        a = 1.0f / a;
    )
    return a;
}
115
+
116
/* Approximates the regularized incomplete gamma function
 * P(a,x) = gammainc(a,x)/gamma(a) with a fitted squared-logistic form.
 * Accuracy is tuned for the Poisson rejection sampler below, not for
 * general-purpose use. */
QUALIFIERS float __cr_pgammainc (float a, float x)
{
    /* First level parametrization constants */
    float ma1 = 1.43248035075540910f,
          ma2 = 0.12400979329415655f,
          ma3 = 0.00025361074907033f,
          mb1 = 0.21096734870196546f,
          mb2 = 1.97381164089999420f,
          mb3 = 0.94201734077887530f;

    /* Second level parametrization constants (depend only on a) */
    float alpha = ma1 * __cr_rsqrt (a - ma2) + ma3;
    float beta  = mb1 * __cr_rsqrt (a - mb2) + mb3;

    /* Final approximation (depends on a and x): 1 / (1 + e^(alpha(a-x)-beta))^2 */
    float r = a - x;
    r = alpha * r - beta;
    r = 1.0f + __cr_exp (r);
    r = r * r;
    r = __cr_rcp (r);

    /* NOTE(review): negative or NaN a/x would need special handling; the
     * original left this disabled:
     * r = !(x > 0 && a >= 0) ? 0.0 : r; */
    return r;
}
149
+
150
/* Approximate inverse of __cr_pgammainc in x for fixed a (same fitted
 * squared-logistic parametrization, solved for x). */
QUALIFIERS float __cr_pgammaincinv (float a, float y)
{
    /* First level parametrization constants */
    float ma1 = 1.43248035075540910f,
          ma2 = 0.12400979329415655f,
          ma3 = 0.00025361074907033f,
          mb1 = 0.21096734870196546f,
          mb2 = 1.97381164089999420f,
          mb3 = 0.94201734077887530f;

    /* Second level parametrization constants (depend only on a) */
    float alpha = ma1 * __cr_rsqrt (a - ma2) + ma3;
    float beta  = mb1 * __cr_rsqrt (a - mb2) + mb3;

    /* Final approximation (depends on a and y): invert the logistic */
    float r = __cr_rsqrt (y) - 1.0f;
    r = __cr_log (r);
    r = beta + r;
    r = - r * __cr_rcp (alpha) + a;
    /* NOTE(review): negative or NaN a/y would need special handling; the
     * original left this disabled:
     * r = !(y > 0 && a >= 0) ? 0.0 : r; */
    return r;
}
181
+
182
/* lgamma(n) for n = 1..9, indexed [n-1]; used by __cr_lgamma_integer for
 * small arguments on device builds. */
#if defined(__CUDACC_RDC__) && (__cplusplus >= 201703L) && defined(__cpp_inline_variables)
/* Relocatable device code: a C++17 inline variable avoids duplicate
 * per-translation-unit copies. */
inline __constant__ double __cr_lgamma_table [] = {
#else
static __constant__ double __cr_lgamma_table [] = {
#endif
    0.000000000000000000e-1,  /* lgamma(1) */
    0.000000000000000000e-1,  /* lgamma(2) */
    6.931471805599453094e-1,  /* lgamma(3) */
    1.791759469228055001e0,   /* lgamma(4) */
    3.178053830347945620e0,   /* lgamma(5) */
    4.787491742782045994e0,   /* lgamma(6) */
    6.579251212010100995e0,   /* lgamma(7) */
    8.525161361065414300e0,   /* lgamma(8) */
    1.060460290274525023e1    /* lgamma(9) */
};
197
+
198
+
199
QUALIFIERS double __cr_lgamma_integer(int a)
{
    // log(gamma(a)) for integer a: table/switch lookup for a <= 8,
    // Stirling series otherwise.
    double fa = fabs((float)a);

    if (a > 8) {
        /* Stirling approximation; coefficients from Hart et al, "Computer
         * Approximations", Wiley 1968. Approximation 5404.
         */
        double s = 1.0 / fa;
        double t = s * s;
        double sum = -0.1633436431e-2;
        sum = sum * t + 0.83645878922e-3;
        sum = sum * t - 0.5951896861197e-3;
        sum = sum * t + 0.793650576493454e-3;
        sum = sum * t - 0.277777777735865004e-2;
        sum = sum * t + 0.833333333333331018375e-1;
        sum = sum * s + 0.918938533204672;
        s = 0.5 * log (fa);
        t = fa - 0.5;
        s = s * t;
        t = s - fa;
        s = s + sum;
        t = t + s;
        return t;
    } else {
        // Device builds read __constant__ memory; host builds cannot, so
        // they use an equivalent switch over the same values.
        NV_IF_ELSE_TARGET(NV_IS_DEVICE,
            return __cr_lgamma_table [(int) fa-1];
        ,
            switch(a) {
                case 1: return 0.000000000000000000e-1;
                case 2: return 0.000000000000000000e-1;
                case 3: return 6.931471805599453094e-1;
                case 4: return 1.791759469228055001e0;
                case 5: return 3.178053830347945620e0;
                case 6: return 4.787491742782045994e0;
                case 7: return 6.579251212010100995e0;
                case 8: return 8.525161361065414300e0;
                default: return 1.060460290274525023e1;
            }
        )
    }
}
244
+
245
+ #define KNUTH_FLOAT_CONST 60.0
246
template <typename T>
// Knuth's multiplicative Poisson sampler (TAOCP vol. 2): multiply uniform
// draws until the running product drops below exp(-lambda).  Starting the
// product at exp(lambda) and comparing against 1.0 is the same test without
// recomputing the threshold each iteration.
QUALIFIERS unsigned int curand_poisson_knuth(T *state, float lambda)
{
    float product = expf(lambda);
    unsigned int draws = 0;
    do {
        draws++;
        product *= curand_uniform(state);
    } while (product > 1.0);
    return draws - 1;
}
258
+
259
template <typename T>
// Knuth's multiplicative Poisson sampler (TAOCP vol. 2), four independent
// variates.  Each lane consumes its own run of uniforms from *state in the
// same order as the previous hand-unrolled implementation, so the returned
// values are unchanged; the duplication is simply folded into the scalar
// sampler.
QUALIFIERS uint4 curand_poisson_knuth4(T *state, float lambda)
{
    uint4 k;
    k.x = curand_poisson_knuth(state, lambda);
    k.y = curand_poisson_knuth(state, lambda);
    k.z = curand_poisson_knuth(state, lambda);
    k.w = curand_poisson_knuth(state, lambda);
    return k;
}
289
+
290
template <typename T>
// Marsaglia, Tsang & Wang (J. Stat. Software) square-histogram lookup:
// maps one raw draw x to a discrete variate via precomputed V/K tables.
QUALIFIERS unsigned int _curand_M2_double(T x, curandDistributionM2Shift_t distributionM2)
{
    double u = _curand_uniform_double(x);
    int j = (int) floor(distributionM2->length*u);

    // Fetch the cell; __ldg routes the read through the read-only cache on
    // sm_35+ devices.
    double cellV;
    unsigned int cellK;
    NV_IF_ELSE_TARGET(NV_PROVIDES_SM_35,
        cellV = __ldg( &(distributionM2->histogram->V[j]));
        cellK = __ldg( &(distributionM2->histogram->K[j]));
    ,
        cellV = distributionM2->histogram->V[j];
        cellK = distributionM2->histogram->K[j];
    )
    // Inside cell j's own mass return j, otherwise its alias K[j].
    if (u < cellV) return distributionM2->shift + j;
    return distributionM2->shift + cellK;
}
311
+
312
template <typename T>
// Marsaglia, Tsang & Wang square-histogram lookup applied independently to
// the four lanes of x (e.g. a uint4 from curand4).  Each lane performs
// exactly the scalar _curand_M2_double computation, so this replaces the
// unrolled flag-tracking version without changing any result; only the
// fourfold duplication is removed.
QUALIFIERS uint4 _curand_M2_double4(T x, curandDistributionM2Shift_t distributionM2)
{
    uint4 result;
    result.x = _curand_M2_double(x.x, distributionM2);
    result.y = _curand_M2_double(x.y, distributionM2);
    result.z = _curand_M2_double(x.z, distributionM2);
    result.w = _curand_M2_double(x.w, distributionM2);
    return result;
}
375
+
376
template <typename STATE>
// Thin wrapper: advance the generator once and feed the raw draw to the
// square-histogram mapper.
QUALIFIERS unsigned int curand_M2_double(STATE *state, curandDistributionM2Shift_t distributionM2)
{
    return _curand_M2_double(curand(state), distributionM2);
}
381
+
382
template <typename STATE>
// Thin wrapper: one four-lane draw, then the four-lane square-histogram
// mapper.
QUALIFIERS uint4 curand_M2_double4(STATE *state, curandDistributionM2Shift_t distributionM2)
{
    return _curand_M2_double4(curand4(state), distributionM2);
}
387
+
388
+
389
template <typename T>
// Inverse-CDF sampling by binary search over the cumulative probability
// table: returns shift + index of the first entry whose cdf >= u.
QUALIFIERS unsigned int _curand_binary_search_double(T x, curandDistributionShift_t distribution)
{
    double u = _curand_uniform_double(x);
    int lo = 0;
    int hi = distribution->length-1;
    do {
        int mid = (hi + lo)/2;
        double cdf_mid;
        // __ldg uses the read-only cache on sm_35+ devices.
        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_35,
            cdf_mid = __ldg( &(distribution->probability[mid]));
        ,
            cdf_mid = distribution->probability[mid];
        )
        if (u <= cdf_mid) {
            hi = mid;
        } else {
            lo = mid+1;
        }
    } while (lo < hi);
    return distribution->shift + lo;
}
411
+
412
template <typename STATE>
// Thin wrapper: one raw draw, then the binary-search inverse-CDF lookup.
QUALIFIERS unsigned int curand_binary_search_double(STATE *state, curandDistributionShift_t distribution)
{
    return _curand_binary_search_double(curand(state), distribution);
}
417
+
418
// Maps a raw 32-bit draw to a uniform double strictly inside (0.0, 1.0).
// The standard _curand_uniform_double can yield exactly 1.0, which would
// hang _curand_poisson_ITR_double's inner loop; the half-step offset keeps
// the maximum at 1 - 2^-33 and the minimum at 2^-33.
QUALIFIERS double _curand_uniform_double_excluding_one(unsigned int x)
{
    return x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
}
425
+
426
// Overload for 64-bit raw draws: keep the top 53 bits and offset by a
// quarter step so the result stays strictly inside (0.0, 1.0).
// Required only by _curand_poisson_ITR_double.
QUALIFIERS double _curand_uniform_double_excluding_one(unsigned long long x)
{
    return (x >> 11) * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/4.0);
}
432
+
433
+ #define MAGIC_DOUBLE_CONST 500.0
434
template <typename T>
// George S. Fishman, "Discrete-Event Simulation": inverse transform by
// sequential search.  To avoid double underflow for large lambda, the
// pmf/cdf accumulators are renormalized in exp(-MAGIC_DOUBLE_CONST) chunks.
QUALIFIERS unsigned int _curand_poisson_ITR_double(T x, double lambda)
{
    double rescale, pmf = 1.0;
    double cdf = 1.0;
    unsigned int k = 0;
    int chunk = 0;
    // The search needs u in (0;1): _curand_uniform_double returns a value in
    // (0;1], and u == 1.0 would make the inner loop run forever, so use the
    // one-excluding variant.
    double u = _curand_uniform_double_excluding_one(x);
    do {
        // Fold the next block of the exp(-lambda) normalization into both
        // accumulators.
        if (lambda > (double)(chunk+MAGIC_DOUBLE_CONST)) {
            rescale = exp(-MAGIC_DOUBLE_CONST);
        } else {
            rescale = exp((double)(chunk - lambda));
        }
        pmf *= rescale;
        cdf *= rescale;
        chunk += (int) MAGIC_DOUBLE_CONST;
        // Walk the cdf forward until it passes u.
        while (u > cdf) {
            k++;
            pmf *= ((double)lambda / (double) k);
            cdf += pmf;
        }
    } while ((double)chunk < lambda);
    return k;
}
463
+
464
template <typename T>
/* Rejection sampler for the Poisson distribution driven by the gammainc
 * approximation: propose from the approximate inverse CDF, then accept
 * against the exact (up to __cr_* precision) Poisson pmf. */
QUALIFIERS unsigned int curand_poisson_gammainc(T state, float lambda){
    float log_lambda = __cr_log (lambda);
    float sample;
    for (;;) {
        float u = curand_uniform (state);
        sample = floorf (__cr_pgammaincinv (lambda, u));
        float reject = curand_uniform (state);
        // Envelope: approximate cell probability, widened by 1.3.
        float band = (__cr_pgammainc (lambda, sample + 1.0f) - __cr_pgammainc (lambda, sample)) * 1.3f;
        reject = reject*band;
        // Exact pmf: exp(-lambda + k*log(lambda) - lgamma(k+1)).
        float target = (float)__cr_exp (-lambda + sample * log_lambda - (float)__cr_lgamma_integer ((int)(1.0f + sample)));
        if ((reject < target) && (band>=1e-20))
            break;
    }
    return (unsigned int)sample;
}
482
+
483
template <typename T>
/* Rejection sampler for the Poisson distribution based on the gammainc
 * approximation, producing four independent variates.  Each lane runs the
 * scalar rejection loop with the same per-iteration draw order as the
 * previous four-times-unrolled implementation, so results are unchanged;
 * the duplicated loop bodies are simply delegated to
 * curand_poisson_gammainc. */
QUALIFIERS uint4 curand_poisson_gammainc4(T state, float lambda){
    uint4 result;
    result.x = curand_poisson_gammainc(state, lambda);
    result.y = curand_poisson_gammainc(state, lambda);
    result.z = curand_poisson_gammainc(state, lambda);
    result.w = curand_poisson_gammainc(state, lambda);
    return result;
}
543
+ // Note below that the round to nearest integer, where needed,is done in line with code that
544
+ // assumes the range of values is < 2**32
545
+
546
template <typename T>
// Single-draw Poisson from one raw value: exact inverse-transform search
// for small lambda, normal approximation rounded to nearest above it.
// Assumes the result fits in 32 bits.
QUALIFIERS unsigned int _curand_poisson(T x, double lambda)
{
    if (lambda < 1000)
        return _curand_poisson_ITR_double(x, lambda);
    return (unsigned int)((sqrt(lambda) * _curand_normal_icdf_double(x)) + lambda + 0.5); //Round to nearest
}
553
+
554
template <typename T>
// Normal approximation to Poisson from a single raw draw (single-precision
// icdf), rounded to nearest.  Assumes the result fits in 32 bits.
QUALIFIERS unsigned int _curand_poisson_from_normal(T x, double lambda)
{
    return (unsigned int)((sqrt(lambda) * _curand_normal_icdf(x)) + lambda + 0.5); //Round to nearest
}
559
+
560
template <typename STATE>
// Normal approximation to Poisson drawn directly from a generator state,
// rounded to nearest.
QUALIFIERS unsigned int curand_poisson_from_normal(STATE state, double lambda)
{
    return (unsigned int)((sqrt(lambda) * curand_normal(state)) + lambda + 0.5); //Round to nearest
}
565
+
566
template <typename STATE>
// Four Poisson variates via the normal approximation, from a single
// curand_normal4 draw; each lane is rounded to nearest.
QUALIFIERS uint4 curand_poisson_from_normal4(STATE state, double lambda)
{
    uint4 result;
    float4 gauss = curand_normal4(state);
    result.x = (unsigned int)((sqrt(lambda) * gauss.x) + lambda + 0.5); //Round to nearest
    result.y = (unsigned int)((sqrt(lambda) * gauss.y) + lambda + 0.5); //Round to nearest
    result.z = (unsigned int)((sqrt(lambda) * gauss.z) + lambda + 0.5); //Round to nearest
    result.w = (unsigned int)((sqrt(lambda) * gauss.w) + lambda + 0.5); //Round to nearest
    return result;
}
580
+
581
+ /**
582
+ * \brief Return a Poisson-distributed unsigned int from a XORWOW generator.
583
+ *
584
+ * Return a single unsigned int from a Poisson
585
+ * distribution with lambda \p lambda from the XORWOW generator in \p state,
586
+ * increment the position of the generator by a variable amount, depending
587
+ * on the algorithm used.
588
+ *
589
+ * \param state - Pointer to state to update
590
+ * \param lambda - Lambda of the Poisson distribution
591
+ *
592
+ * \return Poisson-distributed unsigned int with lambda \p lambda
593
+ */
594
QUALIFIERS unsigned int curand_poisson(curandStateXORWOW_t *state, double lambda)
{
    // Select the sampler by lambda: Knuth's product method for small means,
    // the gammainc rejection sampler in the middle range, and the normal
    // approximation (rounded to nearest) for large means.
    if (lambda < 64)
        return curand_poisson_knuth(state, (float)lambda);
    if (lambda > 4000)
        return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
    return curand_poisson_gammainc(state, (float)lambda);
}
602
+
603
+ /**
604
+ * \brief Return a Poisson-distributed unsigned int from a Philox4_32_10 generator.
605
+ *
606
+ * Return a single unsigned int from a Poisson
607
+ * distribution with lambda \p lambda from the Philox4_32_10 generator in \p state,
608
+ * increment the position of the generator by a variable amount, depending
609
+ * on the algorithm used.
610
+ *
611
+ * \param state - Pointer to state to update
612
+ * \param lambda - Lambda of the Poisson distribution
613
+ *
614
+ * \return Poisson-distributed unsigned int with lambda \p lambda
615
+ */
616
QUALIFIERS unsigned int curand_poisson(curandStatePhilox4_32_10_t *state, double lambda)
{
    // Same lambda-based selection as the XORWOW overload: Knuth for small
    // means, gammainc rejection in the middle, normal approximation above.
    if (lambda < 64)
        return curand_poisson_knuth(state, (float)lambda);
    if (lambda > 4000)
        return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
    return curand_poisson_gammainc(state, (float)lambda);
}
624
+ /**
625
+ * \brief Return four Poisson-distributed unsigned ints from a Philox4_32_10 generator.
626
+ *
627
+ * Return four unsigned ints from a Poisson
628
+ * distribution with lambda \p lambda from the Philox4_32_10 generator in \p state,
629
+ * increment the position of the generator by a variable amount, depending
630
+ * on the algorithm used.
631
+ *
632
+ * \param state - Pointer to state to update
633
+ * \param lambda - Lambda of the Poisson distribution
634
+ *
635
+ * \return Poisson-distributed unsigned int with lambda \p lambda
636
+ */
637
QUALIFIERS uint4 curand_poisson4(curandStatePhilox4_32_10_t *state, double lambda)
{
    // Four variates at once, with the same lambda-based algorithm selection
    // as the scalar overloads.
    if (lambda < 64)
        return curand_poisson_knuth4(state, (float)lambda);
    if (lambda > 4000) {
        double4 gauss = curand_normal4_double(state);
        double sd = sqrt(lambda);  // invariant across lanes
        uint4 result;
        result.x = (unsigned int)((sd * gauss.x) + lambda + 0.5); //Round to nearest
        result.y = (unsigned int)((sd * gauss.y) + lambda + 0.5); //Round to nearest
        result.z = (unsigned int)((sd * gauss.z) + lambda + 0.5); //Round to nearest
        result.w = (unsigned int)((sd * gauss.w) + lambda + 0.5); //Round to nearest
        return result;
    }
    return curand_poisson_gammainc4(state, (float)lambda);
}
653
+
654
+
655
+
656
+ /**
657
+ * \brief Return a Poisson-distributed unsigned int from a MRG32k3A generator.
658
+ *
659
+ * Return a single unsigned int from a Poisson
660
+ * distribution with lambda \p lambda from the MRG32k3a generator in \p state,
661
+ * increment the position of the generator by a variable amount, depending
662
+ * on the algorithm used.
663
+ *
664
+ * \param state - Pointer to state to update
665
+ * \param lambda - Lambda of the Poisson distribution
666
+ *
667
+ * \return Poisson-distributed unsigned int with lambda \p lambda
668
+ */
669
QUALIFIERS unsigned int curand_poisson(curandStateMRG32k3a_t *state, double lambda)
{
    // Same lambda-based selection as the other pseudorandom overloads.
    if (lambda < 64)
        return curand_poisson_knuth(state, (float)lambda);
    if (lambda > 4000)
        return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
    return curand_poisson_gammainc(state, (float)lambda);
}
677
+
678
+ /**
679
+ * \brief Return a Poisson-distributed unsigned int from a MTGP32 generator.
680
+ *
681
+ * Return a single unsigned int from a Poisson
682
+ * distribution with lambda \p lambda from the MTGP32 generator in \p state,
683
+ * increment the position of the generator by one.
684
+ *
685
+ * \param state - Pointer to state to update
686
+ * \param lambda - Lambda of the Poisson distribution
687
+ *
688
+ * \return Poisson-distributed unsigned int with lambda \p lambda
689
+ */
690
QUALIFIERS unsigned int curand_poisson(curandStateMtgp32_t *state, double lambda)
{
    // MTGP32 consumes exactly one raw draw; the shared single-draw helper
    // picks the algorithm from lambda.
    return _curand_poisson(curand(state), lambda);
}
694
+
695
+ /**
696
+ * \brief Return a Poisson-distributed unsigned int from a Sobol32 generator.
697
+ *
698
+ * Return a single unsigned int from a Poisson
699
+ * distribution with lambda \p lambda from the Sobol32 generator in \p state,
700
+ * increment the position of the generator by one.
701
+ *
702
+ * \param state - Pointer to state to update
703
+ * \param lambda - Lambda of the Poisson distribution
704
+ *
705
+ * \return Poisson-distributed unsigned int with lambda \p lambda
706
+ */
707
+
708
QUALIFIERS unsigned int curand_poisson(curandStateSobol32_t *state, double lambda)
{
    // Quasirandom generator: exactly one raw draw per variate, mapped by
    // the shared single-draw helper.
    return _curand_poisson(curand(state), lambda);
}
712
+
713
+ /**
714
+ * \brief Return a Poisson-distributed unsigned int from a scrambled Sobol32 generator.
715
+ *
716
+ * Return a single unsigned int from a Poisson
717
+ * distribution with lambda \p lambda from the scrambled Sobol32 generator in \p state,
718
+ * increment the position of the generator by one.
719
+ *
720
+ * \param state - Pointer to state to update
721
+ * \param lambda - Lambda of the Poisson distribution
722
+ *
723
+ * \return Poisson-distributed unsigned int with lambda \p lambda
724
+ */
725
QUALIFIERS unsigned int curand_poisson(curandStateScrambledSobol32_t *state, double lambda)
{
    // Scrambled Sobol32: one raw draw per variate, mapped by the shared
    // single-draw helper.
    return _curand_poisson(curand(state), lambda);
}
729
+
730
+ /**
731
+ * \brief Return a Poisson-distributed unsigned int from a Sobol64 generator.
732
+ *
733
+ * Return a single unsigned int from a Poisson
734
+ * distribution with lambda \p lambda from the Sobol64 generator in \p state,
735
+ * increment position of generator by one.
736
+ *
737
+ * \param state - Pointer to state to update
738
+ * \param lambda - Lambda of the Poisson distribution
739
+ *
740
+ * \return Poisson-distributed unsigned int with lambda \p lambda
741
+ */
742
QUALIFIERS unsigned int curand_poisson(curandStateSobol64_t *state, double lambda)
{
    // Sobol64: one raw 64-bit draw per variate, mapped by the shared
    // single-draw helper.
    return _curand_poisson(curand(state), lambda);
}
746
+
747
+ /**
748
+ * \brief Return a Poisson-distributed unsigned int from a scrambled Sobol64 generator.
749
+ *
750
+ * Return a single unsigned int from a Poisson
751
+ * distribution with lambda \p lambda from the scrambled Sobol64 generator in \p state,
752
+ * increment position of generator by one.
753
+ *
754
+ * \param state - Pointer to state to update
755
+ * \param lambda - Lambda of the Poisson distribution
756
+ *
757
+ * \return Poisson-distributed unsigned int with lambda \p lambda
758
+ */
759
QUALIFIERS unsigned int curand_poisson(curandStateScrambledSobol64_t *state, double lambda)
{
    // Scrambled Sobol64: one raw 64-bit draw per variate, mapped by the
    // shared single-draw helper.
    return _curand_poisson(curand(state), lambda);
}
763
+ #endif // !defined(CURAND_POISSON_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_precalc.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_uniform.h ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_UNIFORM_H_)
52
+ #define CURAND_UNIFORM_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+
68
+
69
QUALIFIERS float _curand_uniform(unsigned int x)
{
    // Map a 32-bit draw into (0, 1]: scale by 2^-32 and offset by half a
    // step so x == 0 never yields 0.0f.
    return x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
}
73
+
74
QUALIFIERS float4 _curand_uniform4(uint4 x)
{
    // Component-wise version of _curand_uniform for a uint4 draw.
    float4 y;
    y.x = _curand_uniform(x.x);
    y.y = _curand_uniform(x.y);
    y.z = _curand_uniform(x.z);
    y.w = _curand_uniform(x.w);
    return y;
}
83
+
84
QUALIFIERS float _curand_uniform(unsigned long long x)
{
    // A float carries far fewer than 64 bits of precision, so keep only the
    // top 32 bits of the draw and reuse the 32-bit mapping.
    unsigned int hi = (unsigned int)(x >> 32);
    return hi * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
}
90
+
91
QUALIFIERS double _curand_uniform_double(unsigned int x)
{
    // Affine map of a 32-bit draw into (0, 1]: equals (x+1) * 2^-32, so
    // x == 2^32-1 yields exactly 1.0 and x == 0 yields 2^-32.
    // NOTE(review): unlike the float path this adds a full step rather than
    // a half step — presumably intentional so the range includes 1.0;
    // confirm before "fixing".
    return x * CURAND_2POW32_INV_DOUBLE + CURAND_2POW32_INV_DOUBLE;
}
95
+
96
+ QUALIFIERS double _curand_uniform_double(unsigned long long x)
97
+ {
98
+ return (x >> 11) * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
99
+ }
100
+
101
+ QUALIFIERS double _curand_uniform_double_hq(unsigned int x, unsigned int y)
102
+ {
103
+ unsigned long long z = (unsigned long long)x ^
104
+ ((unsigned long long)y << (53 - 32));
105
+ return z * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
106
+ }
107
+
108
+ QUALIFIERS float curand_uniform(curandStateTest_t *state)
109
+ {
110
+ return _curand_uniform(curand(state));
111
+ }
112
+
113
+ QUALIFIERS double curand_uniform_double(curandStateTest_t *state)
114
+ {
115
+ return _curand_uniform_double(curand(state));
116
+ }
117
+
118
+ /**
119
+ * \brief Return a uniformly distributed float from an XORWOW generator.
120
+ *
121
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
122
+ * from the XORWOW generator in \p state, increment position of generator.
123
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
124
+ * point outputs are never returned.
125
+ *
126
+ * The implementation may use any number of calls to \p curand() to
127
+ * get enough random bits to create the return value. The current
128
+ * implementation uses one call.
129
+ *
130
+ * \param state - Pointer to state to update
131
+ *
132
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
133
+ */
134
+ QUALIFIERS float curand_uniform(curandStateXORWOW_t *state)
135
+ {
136
+ return _curand_uniform(curand(state));
137
+ }
138
+
139
+ /**
140
+ * \brief Return a uniformly distributed double from an XORWOW generator.
141
+ *
142
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
143
+ * from the XORWOW generator in \p state, increment position of generator.
144
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
145
+ * point outputs are never returned.
146
+ *
147
+ * The implementation may use any number of calls to \p curand() to
148
+ * get enough random bits to create the return value. The current
149
+ * implementation uses exactly two calls.
150
+ *
151
+ * \param state - Pointer to state to update
152
+ *
153
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
154
+ */
155
+ QUALIFIERS double curand_uniform_double(curandStateXORWOW_t *state)
156
+ {
157
+ unsigned int x, y;
158
+ x = curand(state);
159
+ y = curand(state);
160
+ return _curand_uniform_double_hq(x, y);
161
+ }
162
+ /**
163
+ * \brief Return a uniformly distributed float from an MRG32k3a generator.
164
+ *
165
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
166
+ * from the MRG32k3a generator in \p state, increment position of generator.
167
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
168
+ * point outputs are never returned.
169
+ *
170
+ * The implementation returns up to 23 bits of mantissa, with the minimum
171
+ * return value \f$ 2^{-32} \f$
172
+ *
173
+ * \param state - Pointer to state to update
174
+ *
175
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
176
+ */
177
+ QUALIFIERS float curand_uniform(curandStateMRG32k3a_t *state)
178
+ {
179
+ return ((float)(curand_MRG32k3a(state)*MRG32K3A_NORM));
180
+ }
181
+
182
+ /**
183
+ * \brief Return a uniformly distributed double from an MRG32k3a generator.
184
+ *
185
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
186
+ * from the MRG32k3a generator in \p state, increment position of generator.
187
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
188
+ * point outputs are never returned.
189
+ *
190
+ * Note the implementation returns at most 32 random bits of mantissa as
191
+ * outlined in the seminal paper by L'Ecuyer.
192
+ *
193
+ * \param state - Pointer to state to update
194
+ *
195
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
196
+ */
197
+ QUALIFIERS double curand_uniform_double(curandStateMRG32k3a_t *state)
198
+ {
199
+ return curand_MRG32k3a(state)*MRG32K3A_NORM;
200
+ }
201
+
202
+
203
+
204
+ /**
205
+ * \brief Return a uniformly distributed tuple of 2 doubles from an Philox4_32_10 generator.
206
+ *
207
+ * Return a uniformly distributed 2 doubles (double4) between \p 0.0 and \p 1.0
208
+ * from the Philox4_32_10 generator in \p state, increment position of generator by 4.
209
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
210
+ * point outputs are never returned.
211
+ *
212
+ * \param state - Pointer to state to update
213
+ *
214
+ * \return 2 uniformly distributed doubles between \p 0.0 and \p 1.0
215
+ */
216
+
217
+ QUALIFIERS double2 curand_uniform2_double(curandStatePhilox4_32_10_t *state)
218
+ {
219
+ uint4 _x;
220
+ double2 result;
221
+ _x = curand4(state);
222
+ result.x = _curand_uniform_double_hq(_x.x,_x.y);
223
+ result.y = _curand_uniform_double_hq(_x.z,_x.w);
224
+ return result;
225
+ }
226
+
227
+
228
+ // not a part of API
229
+ QUALIFIERS double4 curand_uniform4_double(curandStatePhilox4_32_10_t *state)
230
+ {
231
+ uint4 _x, _y;
232
+ double4 result;
233
+ _x = curand4(state);
234
+ _y = curand4(state);
235
+ result.x = _curand_uniform_double_hq(_x.x,_x.y);
236
+ result.y = _curand_uniform_double_hq(_x.z,_x.w);
237
+ result.z = _curand_uniform_double_hq(_y.x,_y.y);
238
+ result.w = _curand_uniform_double_hq(_y.z,_y.w);
239
+ return result;
240
+ }
241
+
242
+ /**
243
+ * \brief Return a uniformly distributed float from a Philox4_32_10 generator.
244
+ *
245
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
246
+ * from the Philox4_32_10 generator in \p state, increment position of generator.
247
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
248
+ * point outputs are never returned.
249
+ *
250
+ * \param state - Pointer to state to update
251
+ *
252
+ * \return uniformly distributed float between \p 0.0 and \p 1.0
253
+ *
254
+ */
255
+ QUALIFIERS float curand_uniform(curandStatePhilox4_32_10_t *state)
256
+ {
257
+ return _curand_uniform(curand(state));
258
+ }
259
+
260
+ /**
261
+ * \brief Return a uniformly distributed tuple of 4 floats from a Philox4_32_10 generator.
262
+ *
263
+ * Return a uniformly distributed 4 floats between \p 0.0f and \p 1.0f
264
+ * from the Philox4_32_10 generator in \p state, increment position of generator by 4.
265
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
266
+ * point outputs are never returned.
267
+ *
268
+ * \param state - Pointer to state to update
269
+ *
270
+ * \return uniformly distributed float between \p 0.0 and \p 1.0
271
+ *
272
+ */
273
+ QUALIFIERS float4 curand_uniform4(curandStatePhilox4_32_10_t *state)
274
+ {
275
+ return _curand_uniform4(curand4(state));
276
+ }
277
+
278
+ /**
279
+ * \brief Return a uniformly distributed float from a MTGP32 generator.
280
+ *
281
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
282
+ * from the MTGP32 generator in \p state, increment position of generator.
283
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
284
+ * point outputs are never returned.
285
+ *
286
+ * \param state - Pointer to state to update
287
+ *
288
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
289
+ */
290
+ QUALIFIERS float curand_uniform(curandStateMtgp32_t *state)
291
+ {
292
+ return _curand_uniform(curand(state));
293
+ }
294
+ /**
295
+ * \brief Return a uniformly distributed double from a MTGP32 generator.
296
+ *
297
+ * Return a uniformly distributed double between \p 0.0f and \p 1.0f
298
+ * from the MTGP32 generator in \p state, increment position of generator.
299
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
300
+ * point outputs are never returned.
301
+ *
302
+ * Note that the implementation uses only 32 random bits to generate a single double
303
+ * precision value.
304
+ *
305
+ * \param state - Pointer to state to update
306
+ *
307
+ * \return uniformly distributed double between \p 0.0f and \p 1.0f
308
+ */
309
+ QUALIFIERS double curand_uniform_double(curandStateMtgp32_t *state)
310
+ {
311
+ return _curand_uniform_double(curand(state));
312
+ }
313
+
314
+ /**
315
+ * \brief Return a uniformly distributed double from a Philox4_32_10 generator.
316
+ *
317
+ * Return a uniformly distributed double between \p 0.0f and \p 1.0f
318
+ * from the Philox4_32_10 generator in \p state, increment position of generator.
319
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
320
+ * point outputs are never returned.
321
+ *
322
+ * Note that the implementation uses only 32 random bits to generate a single double
323
+ * precision value.
324
+ *
325
+ * \p curand_uniform2_double() is recommended for higher quality uniformly distributed
326
+ * double precision values.
327
+ *
328
+ * \param state - Pointer to state to update
329
+ *
330
+ * \return uniformly distributed double between \p 0.0f and \p 1.0f
331
+ */
332
+
333
+ QUALIFIERS double curand_uniform_double(curandStatePhilox4_32_10_t *state)
334
+ {
335
+ return _curand_uniform_double(curand(state));
336
+ }
337
+
338
+
339
+ /**
340
+ * \brief Return a uniformly distributed float from a Sobol32 generator.
341
+ *
342
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
343
+ * from the Sobol32 generator in \p state, increment position of generator.
344
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
345
+ * point outputs are never returned.
346
+ *
347
+ * The implementation is guaranteed to use a single call to \p curand().
348
+ *
349
+ * \param state - Pointer to state to update
350
+ *
351
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
352
+ */
353
+ QUALIFIERS float curand_uniform(curandStateSobol32_t *state)
354
+ {
355
+ return _curand_uniform(curand(state));
356
+ }
357
+
358
+ /**
359
+ * \brief Return a uniformly distributed double from a Sobol32 generator.
360
+ *
361
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
362
+ * from the Sobol32 generator in \p state, increment position of generator.
363
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
364
+ * point outputs are never returned.
365
+ *
366
+ * The implementation is guaranteed to use a single call to \p curand()
367
+ * to preserve the quasirandom properties of the sequence.
368
+ *
369
+ * Note that the implementation uses only 32 random bits to generate a single double
370
+ * precision value.
371
+ *
372
+ * \param state - Pointer to state to update
373
+ *
374
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
375
+ */
376
+ QUALIFIERS double curand_uniform_double(curandStateSobol32_t *state)
377
+ {
378
+ return _curand_uniform_double(curand(state));
379
+ }
380
+ /**
381
+ * \brief Return a uniformly distributed float from a scrambled Sobol32 generator.
382
+ *
383
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
384
+ * from the scrambled Sobol32 generator in \p state, increment position of generator.
385
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
386
+ * point outputs are never returned.
387
+ *
388
+ * The implementation is guaranteed to use a single call to \p curand().
389
+ *
390
+ * \param state - Pointer to state to update
391
+ *
392
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
393
+ */
394
+ QUALIFIERS float curand_uniform(curandStateScrambledSobol32_t *state)
395
+ {
396
+ return _curand_uniform(curand(state));
397
+ }
398
+
399
+ /**
400
+ * \brief Return a uniformly distributed double from a scrambled Sobol32 generator.
401
+ *
402
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
403
+ * from the scrambled Sobol32 generator in \p state, increment position of generator.
404
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
405
+ * point outputs are never returned.
406
+ *
407
+ * The implementation is guaranteed to use a single call to \p curand()
408
+ * to preserve the quasirandom properties of the sequence.
409
+ *
410
+ * Note that the implementation uses only 32 random bits to generate a single double
411
+ * precision value.
412
+ *
413
+ * \param state - Pointer to state to update
414
+ *
415
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
416
+ */
417
+ QUALIFIERS double curand_uniform_double(curandStateScrambledSobol32_t *state)
418
+ {
419
+ return _curand_uniform_double(curand(state));
420
+ }
421
+ /**
422
+ * \brief Return a uniformly distributed float from a Sobol64 generator.
423
+ *
424
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
425
+ * from the Sobol64 generator in \p state, increment position of generator.
426
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
427
+ * point outputs are never returned.
428
+ *
429
+ * The implementation is guaranteed to use a single call to \p curand().
430
+ *
431
+ * \param state - Pointer to state to update
432
+ *
433
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
434
+ */
435
+ QUALIFIERS float curand_uniform(curandStateSobol64_t *state)
436
+ {
437
+ return _curand_uniform(curand(state));
438
+ }
439
+
440
+ /**
441
+ * \brief Return a uniformly distributed double from a Sobol64 generator.
442
+ *
443
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
444
+ * from the Sobol64 generator in \p state, increment position of generator.
445
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
446
+ * point outputs are never returned.
447
+ *
448
+ * The implementation is guaranteed to use a single call to \p curand()
449
+ * to preserve the quasirandom properties of the sequence.
450
+ *
451
+ * \param state - Pointer to state to update
452
+ *
453
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
454
+ */
455
+ QUALIFIERS double curand_uniform_double(curandStateSobol64_t *state)
456
+ {
457
+ return _curand_uniform_double(curand(state));
458
+ }
459
+ /**
460
+ * \brief Return a uniformly distributed float from a scrambled Sobol64 generator.
461
+ *
462
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
463
+ * from the scrambled Sobol64 generator in \p state, increment position of generator.
464
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
465
+ * point outputs are never returned.
466
+ *
467
+ * The implementation is guaranteed to use a single call to \p curand().
468
+ *
469
+ * \param state - Pointer to state to update
470
+ *
471
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
472
+ */
473
+ QUALIFIERS float curand_uniform(curandStateScrambledSobol64_t *state)
474
+ {
475
+ return _curand_uniform(curand(state));
476
+ }
477
+
478
+ /**
479
+ * \brief Return a uniformly distributed double from a scrambled Sobol64 generator.
480
+ *
481
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
482
+ * from the scrambled Sobol64 generator in \p state, increment position of generator.
483
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
484
+ * point outputs are never returned.
485
+ *
486
+ * The implementation is guaranteed to use a single call to \p curand()
487
+ * to preserve the quasirandom properties of the sequence.
488
+ *
489
+ * \param state - Pointer to state to update
490
+ *
491
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
492
+ */
493
+ QUALIFIERS double curand_uniform_double(curandStateScrambledSobol64_t *state)
494
+ {
495
+ return _curand_uniform_double(curand(state));
496
+ }
497
+
498
+ #endif // !defined(CURAND_UNIFORM_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (190 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (184 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (192 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExt.h ADDED
@@ -0,0 +1,1561 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ /** \file nvToolsExt.h
39
+ */
40
+
41
+ /* ========================================================================= */
42
+ /** \mainpage
43
+ * \tableofcontents
44
+ * \section INTRODUCTION Introduction
45
+ *
46
+ * The NVIDIA Tools Extension library is a set of functions that a
47
+ * developer can use to provide additional information to tools.
48
+ * The additional information is used by the tool to improve
49
+ * analysis and visualization of data.
50
+ *
51
+ * The library introduces close to zero overhead if no tool is
52
+ * attached to the application. The overhead when a tool is
53
+ * attached is specific to the tool.
54
+ *
55
+ * \section INITIALIZATION_SECTION Initialization
56
+ *
57
+ * Typically the tool's library that plugs into NVTX is indirectly
58
+ * loaded via enviromental properties that are platform specific.
59
+ * For some platform or special cases, the user may be required
60
+ * to instead explicity initialize instead though. This can also
61
+ * be helpful to control when the API loads a tool's library instead
62
+ * of what would typically be the first function call to emit info.
63
+ * For these rare case, see \ref INITIALIZATION for additional information.
64
+ *
65
+ * \section MARKERS_AND_RANGES Markers and Ranges
66
+ *
67
+ * Markers and ranges are used to describe events at a specific time (markers)
68
+ * or over a time span (ranges) during the execution of the application
69
+ * respectively.
70
+ *
71
+ * \subsection MARKERS Markers
72
+ *
73
+ * Markers denote specific moments in time.
74
+ *
75
+ *
76
+ * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
77
+ * how to specify the domain.
78
+ *
79
+ * \subsection THREAD_RANGES Thread Ranges
80
+ *
81
+ * Thread ranges denote nested time ranges. Nesting is maintained per thread
82
+ * per domain and does not require any additional correlation mechanism. The
83
+ * duration of a thread range is defined by the corresponding pair of
84
+ * nvtxRangePush* to nvtxRangePop API calls.
85
+ *
86
+ * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
87
+ * how to specify the domain.
88
+ *
89
+ * \subsection PROCESS_RANGES Process Ranges
90
+ *
91
+ * Process ranges denote a time span that can expose arbitrary concurrency, as
92
+ * opposed to thread ranges that only support nesting. In addition the range
93
+ * start event can happen on a different thread than the end marker. For the
94
+ * correlation of a start/end pair an unique correlation ID is used that is
95
+ * returned from the start API call and needs to be passed into the end API
96
+ * call.
97
+ *
98
+ * \subsection EVENT_ATTRIBUTES Event Attributes
99
+ *
100
+ * \ref MARKERS_AND_RANGES can be annotated with various attributes to provide
101
+ * additional information for an event or to guide the tool's visualization of
102
+ * the data. Each of the attributes is optional and if left unused the
103
+ * attributes fall back to a default value. The attributes include:
104
+ * - color
105
+ * - category
106
+ *
107
+ * To specify any attribute other than the text message, the \ref
108
+ * EVENT_ATTRIBUTE_STRUCTURE "Event Attribute Structure" must be used.
109
+ *
110
+ * \section DOMAINS Domains
111
+ *
112
+ * Domains enable developers to scope annotations. By default all events and
113
+ * annotations are in the default domain. Additional domains can be registered.
114
+ * This allows developers to scope markers, ranges, and resources names to
115
+ * avoid conflicts.
116
+ *
117
+ * The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create
118
+ * a named domain.
119
+ *
120
+ * Each domain maintains its own
121
+ * - categories
122
+ * - thread range stacks
123
+ * - registered strings
124
+ *
125
+ * The function ::nvtxDomainDestroy marks the end of the domain. Destroying
126
+ * a domain unregisters and destroys all objects associated with it such as
127
+ * registered strings, resource objects, named categories, and started ranges.
128
+ *
129
+ * \section RESOURCE_NAMING Resource Naming
130
+ *
131
+ * This section covers calls that allow to annotate objects with user-provided
132
+ * names in order to allow for a better analysis of complex trace data. All of
133
+ * the functions take the handle or the ID of the object to name and the name.
134
+ * The functions can be called multiple times during the execution of an
135
+ * application, however, in that case it is implementation dependent which
136
+ * name will be reported by the tool.
137
+ *
138
+ * \subsection CATEGORY_NAMING Category Naming
139
+ *
140
+ * Some function in this library support associating an integer category
141
+ * to enable filtering and sorting. The category naming functions allow
142
+ * the application to associate a user friendly name with the integer
143
+ * category. Support for domains have been added in NVTX_VERSION_2 to
144
+ * avoid collisions when domains are developed independantly.
145
+ *
146
+ * \subsection RESOURCE_OBJECTS Resource Objects
147
+ *
148
+ * Resource objects are a generic mechanism for attaching data to an application
149
+ * resource. The identifier field makes the association to a pointer or handle,
150
+ * while the type field helps provide deeper understanding of the identifier as
151
+ * well as enabling differentiation in cases where handles generated by different
152
+ * APIs may collide. The resource object may also have an associated message to
153
+ * associate with the application resource, enabling further annotation of this
154
+ * object and how it is used.
155
+ *
156
+ * The resource object was introduced in NVTX_VERSION_2 to supersede existing naming
157
+ * functions and allow the application resource identified by those functions to be
158
+ * associated to a domain. The other naming functions are still supported for backward
159
+ * compatibility but will be associated only to the default domain.
160
+ *
161
+ * \subsection RESOURCE_NAMING_OS Resource Naming
162
+ *
163
+ * Some operating system resources creation APIs do not support providing a user friendly
164
+ * name, such as some OS thread creation APIs. This API support resource naming though
165
+ * both through resource objects and functions following the pattern
166
+ * nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2
167
+ * supersede the other functions with a a more general method of assigning names to OS resources,
168
+ * along with associating them to domains too. The older nvtxName* functions are only associated
169
+ * with the default domain.
170
+ * \section EXTENSIONS Optional Extensions
171
+ * Optional extensions will either appear within the existing sections the extend or appear
172
+ * in the "Related Pages" when they introduce new concepts.
173
+ */
174
+
175
+ #ifndef NVTOOLSEXT_H_
176
+ #define NVTOOLSEXT_H_
177
+
178
+ #if defined(_MSC_VER)
179
+ #ifdef NVTX_EXPORTS
180
+ #define NVTX_DECLSPEC
181
+ #else
182
+ #define NVTX_DECLSPEC __declspec(dllimport)
183
+ #endif /* NVTX_EXPORTS */
184
+ #define NVTX_API __stdcall
185
+ #define NVTX_INLINE_STATIC __inline static
186
+ #else /*defined(__GNUC__)*/
187
+ #define NVTX_DECLSPEC
188
+ #define NVTX_API
189
+ #define NVTX_INLINE_STATIC inline static
190
+ #endif /* Platform */
191
+
192
+ /**
193
+ * The nvToolsExt library depends on stdint.h. If the build tool chain in use
194
+ * does not include stdint.h then define NVTX_STDINT_TYPES_ALREADY_DEFINED
195
+ * and define the following types:
196
+ * <ul>
197
+ * <li>uint8_t
198
+ * <li>int8_t
199
+ * <li>uint16_t
200
+ * <li>int16_t
201
+ * <li>uint32_t
202
+ * <li>int32_t
203
+ * <li>uint64_t
204
+ * <li>int64_t
205
+ * <li>uintptr_t
206
+ * <li>intptr_t
207
+ * </ul>
208
+ #define NVTX_STDINT_TYPES_ALREADY_DEFINED if you are using your own header file.
209
+ */
210
+ #ifndef NVTX_STDINT_TYPES_ALREADY_DEFINED
211
+ #include <stdint.h>
212
+ #endif
213
+
214
+ #include <stddef.h>
215
+
216
+ #ifdef __cplusplus
217
+ extern "C" {
218
+ #endif /* __cplusplus */
219
+
220
+ /**
221
+ * Tools Extension API version
222
+ */
223
+ #define NVTX_VERSION 2
224
+
225
+ /**
226
+ * Size of the nvtxEventAttributes_t structure.
227
+ */
228
+ #define NVTX_EVENT_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxEventAttributes_t) ) )
229
+
230
+ /**
231
+ * Size of the nvtxInitializationAttributes_t structure.
232
+ */
233
+ #define NVTX_INITIALIZATION_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxInitializationAttributes_t) ) )
234
+
235
+ #define NVTX_NO_PUSH_POP_TRACKING ((int)-2)
236
+
237
+ typedef uint64_t nvtxRangeId_t;
238
+
239
+
240
+ /* \brief String Handle Structure.
241
+ * \anchor STRING_HANDLE_STRUCTURE
242
+ *
243
+ * This structure is opaque to the user and is used as a handle to reference
244
+ * a string. The tools will return a pointer through the API for the application
245
+ * to hold on it's behalf to reference the string in the future.
246
+ *
247
+ */
248
+ typedef struct nvtxStringHandle* nvtxStringHandle_t;
249
+
250
+ /* \brief Domain Handle Structure.
251
+ * \anchor DOMAIN_HANDLE_STRUCTURE
252
+ *
253
+ * This structure is opaque to the user and is used as a handle to reference
254
+ * a domain. The tools will return a pointer through the API for the application
255
+ * to hold on its behalf to reference the domain in the future.
256
+ *
257
+ */
258
+ typedef struct nvtxDomainHandle* nvtxDomainHandle_t;
259
+
260
+
261
+
262
+
263
+
264
+
265
+ /* ========================================================================= */
266
+ /** \defgroup GENERAL General
267
+ * @{
268
+ */
269
+
270
+ /** ---------------------------------------------------------------------------
271
+ * Color Types
272
+ * ------------------------------------------------------------------------- */
273
+ typedef enum nvtxColorType_t
274
+ {
275
+ NVTX_COLOR_UNKNOWN = 0, /**< Color attribute is unused. */
276
+ NVTX_COLOR_ARGB = 1 /**< An ARGB color is provided. */
277
+ } nvtxColorType_t;
278
+
279
/** ---------------------------------------------------------------------------
 * Message Types
 * ------------------------------------------------------------------------- */
typedef enum nvtxMessageType_t
{
    NVTX_MESSAGE_UNKNOWN         = 0, /**< Message payload is unused. */
    NVTX_MESSAGE_TYPE_ASCII      = 1, /**< A character sequence is used as payload. */
    NVTX_MESSAGE_TYPE_UNICODE    = 2, /**< A wide character sequence is used as payload. */
    /* NVTX_VERSION_2 */
    NVTX_MESSAGE_TYPE_REGISTERED = 3  /**< A unique string handle that was registered
                                           with \ref nvtxDomainRegisterStringA() or
                                           \ref nvtxDomainRegisterStringW(). */
} nvtxMessageType_t;
292
+
293
+ typedef union nvtxMessageValue_t
294
+ {
295
+ const char* ascii;
296
+ const wchar_t* unicode;
297
+ /* NVTX_VERSION_2 */
298
+ nvtxStringHandle_t registered;
299
+ } nvtxMessageValue_t;
300
+
301
+
302
+ /** @} */ /*END defgroup*/
303
+
304
/* ========================================================================= */
/** \defgroup INITIALIZATION Initialization
 * @{
 * Typically the tool library that plugs into NVTX is loaded indirectly
 * via platform-specific environment properties. On some platforms, or in
 * special cases, the user may instead be required to initialize explicitly.
 * Explicit initialization can also be used to control exactly when the API
 * loads a tool's library, rather than waiting for the first
 * information-emitting function call.
 */

/** ---------------------------------------------------------------------------
 * Initialization Modes
 * ------------------------------------------------------------------------- */
typedef enum nvtxInitializationMode_t
{
    NVTX_INITIALIZATION_MODE_UNKNOWN     = 0, /**< A platform that supports indirect initialization will attempt this style, otherwise expect failure. */
    NVTX_INITIALIZATION_MODE_CALLBACK_V1 = 1, /**< A function pointer conforming to NVTX_VERSION=1 will be used. */
    NVTX_INITIALIZATION_MODE_CALLBACK_V2 = 2, /**< A function pointer conforming to NVTX_VERSION=2 will be used. */
    NVTX_INITIALIZATION_MODE_SIZE             /**< Number of valid modes; not itself a usable mode. */
} nvtxInitializationMode_t;
325
+
326
+
327
/** \brief Initialization Attribute Structure.
 * \anchor INITIALIZATION_ATTRIBUTE_STRUCTURE
 *
 * Describes the attributes used for initialization of the NVTX API.
 *
 * \par Initializing the Attributes
 *
 * The caller should always perform the following three tasks when using
 * attributes:
 * <ul>
 * <li>Zero the structure
 * <li>Set the version field
 * <li>Set the size field
 * </ul>
 *
 * Zeroing the structure sets every attribute type and value to its
 * default. The version and size fields are used by the Tools Extension
 * implementation to handle multiple versions of this structure.
 * NVTX_INITIALIZATION_ATTRIB_STRUCT_SIZE may be used for the size.
 *
 * \par Method 1: Initializing nvtxInitializationAttributes_t for future compatibility
 * \code
 * nvtxInitializationAttributes_t initAttribs = {0};
 * initAttribs.version = NVTX_VERSION;
 * initAttribs.size = NVTX_INITIALIZATION_ATTRIB_STRUCT_SIZE;
 * \endcode
 *
 * \par Method 2: Initializing nvtxInitializationAttributes_t for a specific version
 * \code
 * nvtxInitializationAttributes_t initAttribs = {0};
 * initAttribs.version = 2;
 * initAttribs.size = (uint16_t)(sizeof(nvtxInitializationAttributes_v2));
 * \endcode
 *
 * With Method 1 it is critical that the entire binary layout of the
 * structure be set to 0 so that all fields start at their default values.
 *
 * The caller should either use both NVTX_VERSION and
 * NVTX_INITIALIZATION_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
 * with a versioned type (Method 2). Mixing the two methods is likely to
 * cause source-level or binary incompatibility in the future.
 *
 * \par Example:
 * \code
 * // Initialize
 * nvtxInitializationAttributes_t initAttribs = {0};
 * initAttribs.version = NVTX_VERSION;
 * initAttribs.size = NVTX_INITIALIZATION_ATTRIB_STRUCT_SIZE;
 *
 * // Configure the Attributes
 * initAttribs.mode = NVTX_INITIALIZATION_MODE_CALLBACK_V2;
 * initAttribs.fnptr = InitializeInjectionNvtx2;
 * \endcode
 *
 * \sa
 * ::nvtxInitializationMode_t
 * ::nvtxInitialize
 */
typedef struct nvtxInitializationAttributes_v2
{
    /**
     * \brief Version flag of the structure.
     *
     * Set to NVTX_VERSION to indicate the version of the NVTX APIs
     * supported in this header file. May optionally be overridden to
     * target another version of the tools extension library.
     */
    uint16_t version;

    /**
     * \brief Size of the structure.
     *
     * Set to the size in bytes of the attribute structure being passed.
     */
    uint16_t size;

    /**
     * \brief Mode of initialization.
     *
     * Dictates the overall behavior and which other attributes in this
     * struct are consulted.
     *
     * Default value is NVTX_INITIALIZATION_MODE_UNKNOWN (0).
     * \sa
     * ::nvtxInitializationMode_t
     */
    uint32_t mode;

    /**
     * \brief Function pointer used for initialization when the mode requires one.
     *
     * The caller retrieves this pointer from the tool library and supplies
     * it here. The mode must be one of the
     * NVTX_INITIALIZATION_MODE_CALLBACK_V# values for this member to be
     * used; the chosen mode determines both the signature the pointer is
     * cast to from void(*)(void) and the behavior expected of the function.
     *
     * Under the default mode (NVTX_INITIALIZATION_MODE_UNKNOWN) this member
     * is not used: initialization proceeds from external properties, or
     * fails where that is unsupported on the platform.
     *
     * \sa
     * ::nvtxInitializationMode_t
     */
    void(*fnptr)(void);

} nvtxInitializationAttributes_v2;

typedef struct nvtxInitializationAttributes_v2 nvtxInitializationAttributes_t;
450
+
451
+
452
+ /* ------------------------------------------------------------------------- */
453
+ /** \brief Force initialization (optional on most platforms)
454
+ *
455
+ * Force NVTX library to initialize. On some platform NVTX will implicit initialize
456
+ * upon the first function call into an NVTX API.
457
+ *
458
+ * \return Result codes are simplest to assume NVTX_SUCCESS or !NVTX_SUCCESS
459
+ *
460
+ * \param initAttrib - The initialization attribute structure
461
+ *
462
+ * \sa
463
+ * ::nvtxInitializationAttributes_t
464
+ *
465
+ * \version \NVTX_VERSION_2
466
+ * @{ */
467
+ NVTX_DECLSPEC int NVTX_API nvtxInitialize(const nvtxInitializationAttributes_t* initAttrib);
468
+ /** @} */
469
+
470
+
471
+ /** @} */ /*END defgroup*/
472
+
473
/* ========================================================================= */
/** \defgroup EVENT_ATTRIBUTES Event Attributes
 * @{
 */

/** ---------------------------------------------------------------------------
 * Payload Types
 * ------------------------------------------------------------------------- */
/* Doc fixes vs. previous revision: UNKNOWN was mislabeled "Color payload",
 * and the 32-bit integer entries were mislabeled "floating point". */
typedef enum nvtxPayloadType_t
{
    NVTX_PAYLOAD_UNKNOWN             = 0, /**< Payload attribute is unused. */
    NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1, /**< A 64 bit unsigned integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_INT64          = 2, /**< A 64 bit signed integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_DOUBLE         = 3, /**< A 64 bit floating point value is used as payload. */
    /* NVTX_VERSION_2 */
    NVTX_PAYLOAD_TYPE_UNSIGNED_INT32 = 4, /**< A 32 bit unsigned integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_INT32          = 5, /**< A 32 bit signed integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_FLOAT          = 6  /**< A 32 bit floating point value is used as payload. */
} nvtxPayloadType_t;
492
+
493
+ /** \brief Event Attribute Structure.
494
+ * \anchor EVENT_ATTRIBUTE_STRUCTURE
495
+ *
496
+ * This structure is used to describe the attributes of an event. The layout of
497
+ * the structure is defined by a specific version of the tools extension
498
+ * library and can change between different versions of the Tools Extension
499
+ * library.
500
+ *
501
+ * \par Initializing the Attributes
502
+ *
503
+ * The caller should always perform the following three tasks when using
504
+ * attributes:
505
+ * <ul>
506
+ * <li>Zero the structure
507
+ * <li>Set the version field
508
+ * <li>Set the size field
509
+ * </ul>
510
+ *
511
+ * Zeroing the structure sets all the event attributes types and values
512
+ * to the default value.
513
+ *
514
+ * The version and size field are used by the Tools Extension
515
+ * implementation to handle multiple versions of the attributes structure.
516
+ *
517
+ * It is recommended that the caller use one of the following to methods
518
+ * to initialize the event attributes structure:
519
+ *
520
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
521
+ * \code
522
+ * nvtxEventAttributes_t eventAttrib = {0};
523
+ * eventAttrib.version = NVTX_VERSION;
524
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
525
+ * \endcode
526
+ *
527
+ * \par Method 2: Initializing nvtxEventAttributes for a specific version
528
+ * \code
529
+ * nvtxEventAttributes_t eventAttrib = {0};
530
+ * eventAttrib.version = 1;
531
+ * eventAttrib.size = (uint16_t)(sizeof(nvtxEventAttributes_v1));
532
+ * \endcode
533
+ *
534
+ * If the caller uses Method 1 it is critical that the entire binary
535
+ * layout of the structure be configured to 0 so that all fields
536
+ * are initialized to the default value.
537
+ *
538
+ * The caller should either use both NVTX_VERSION and
539
+ * NVTX_EVENT_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
540
+ * and a versioned type (Method 2). Using a mix of the two methods
541
+ * will likely cause either source level incompatibility or binary
542
+ * incompatibility in the future.
543
+ *
544
+ * \par Settings Attribute Types and Values
545
+ *
546
+ *
547
+ * \par Example:
548
+ * \code
549
+ * // Initialize
550
+ * nvtxEventAttributes_t eventAttrib = {0};
551
+ * eventAttrib.version = NVTX_VERSION;
552
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
553
+ *
554
+ * // Configure the Attributes
555
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
556
+ * eventAttrib.color = 0xFF880000;
557
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
558
+ * eventAttrib.message.ascii = "Example";
559
+ * \endcode
560
+ *
561
+ * In the example the caller does not have to set the value of
562
+ * \ref ::nvtxEventAttributes_v2::category or
563
+ * \ref ::nvtxEventAttributes_v2::payload as these fields were set to
564
+ * the default value by {0}.
565
+ * \sa
566
+ * ::nvtxDomainMarkEx
567
+ * ::nvtxDomainRangeStartEx
568
+ * ::nvtxDomainRangePushEx
569
+ */
570
+ typedef struct nvtxEventAttributes_v2
571
+ {
572
+ /**
573
+ * \brief Version flag of the structure.
574
+ *
575
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
576
+ * supported in this header file. This can optionally be overridden to
577
+ * another version of the tools extension library.
578
+ */
579
+ uint16_t version;
580
+
581
+ /**
582
+ * \brief Size of the structure.
583
+ *
584
+ * Needs to be set to the size in bytes of the event attribute
585
+ * structure used to specify the event.
586
+ */
587
+ uint16_t size;
588
+
589
+ /**
590
+ * \brief ID of the category the event is assigned to.
591
+ *
592
+ * A category is a user-controlled ID that can be used to group
593
+ * events. The tool may use category IDs to improve filtering or
594
+ * enable grouping of events in the same category. The functions
595
+ * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used
596
+ * to name a category.
597
+ *
598
+ * Default Value is 0
599
+ */
600
+ uint32_t category;
601
+
602
+ /** \brief Color type specified in this attribute structure.
603
+ *
604
+ * Defines the color format of the attribute structure's \ref COLOR_FIELD
605
+ * "color" field.
606
+ *
607
+ * Default Value is NVTX_COLOR_UNKNOWN
608
+ */
609
+ int32_t colorType; /* nvtxColorType_t */
610
+
611
+ /** \brief Color assigned to this event. \anchor COLOR_FIELD
612
+ *
613
+ * The color that the tool should use to visualize the event.
614
+ */
615
+ uint32_t color;
616
+
617
+ /**
618
+ * \brief Payload type specified in this attribute structure.
619
+ *
620
+ * Defines the payload format of the attribute structure's \ref PAYLOAD_FIELD
621
+ * "payload" field.
622
+ *
623
+ * Default Value is NVTX_PAYLOAD_UNKNOWN
624
+ */
625
+ int32_t payloadType; /* nvtxPayloadType_t */
626
+
627
+ int32_t reserved0;
628
+
629
+ /**
630
+ * \brief Payload assigned to this event. \anchor PAYLOAD_FIELD
631
+ *
632
+ * A numerical value that can be used to annotate an event. The tool could
633
+ * use the payload data to reconstruct graphs and diagrams.
634
+ */
635
+ union payload_t
636
+ {
637
+ uint64_t ullValue;
638
+ int64_t llValue;
639
+ double dValue;
640
+ /* NVTX_VERSION_2 */
641
+ uint32_t uiValue;
642
+ int32_t iValue;
643
+ float fValue;
644
+ } payload;
645
+
646
+ /** \brief Message type specified in this attribute structure.
647
+ *
648
+ * Defines the message format of the attribute structure's \ref MESSAGE_FIELD
649
+ * "message" field.
650
+ *
651
+ * Default Value is NVTX_MESSAGE_UNKNOWN
652
+ */
653
+ int32_t messageType; /* nvtxMessageType_t */
654
+
655
+ /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD
656
+ *
657
+ * The text message that is attached to an event.
658
+ */
659
+ nvtxMessageValue_t message;
660
+
661
+ } nvtxEventAttributes_v2;
662
+
663
+ typedef struct nvtxEventAttributes_v2 nvtxEventAttributes_t;
664
+
665
+ /** @} */ /*END defgroup*/
666
+ /* ========================================================================= */
667
+ /** \defgroup MARKERS_AND_RANGES Markers and Ranges
668
+ *
669
+ * See \ref MARKERS_AND_RANGES for more details
670
+ *
671
+ * @{
672
+ */
673
+
674
+ /** \name Marker */
675
+
676
+ /* ------------------------------------------------------------------------- */
677
+ /** \brief Marks an instantaneous event in the application.
678
+ *
679
+ * A marker can contain a text message or specify additional information
680
+ * using the event attributes structure. These attributes include a text
681
+ * message, color, category, and a payload. Each of the attributes is optional
682
+ * and can only be sent out using the \ref nvtxDomainMarkEx function.
683
+ *
684
+ * nvtxDomainMarkEx(NULL, event) is equivalent to calling
685
+ * nvtxMarkEx(event).
686
+ *
687
+ * \param domain - The domain of scoping the category.
688
+ * \param eventAttrib - The event attribute structure defining the marker's
689
+ * attribute types and attribute values.
690
+ *
691
+ * \sa
692
+ * ::nvtxMarkEx
693
+ *
694
+ * \version \NVTX_VERSION_2
695
+ * @{ */
696
+ NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
697
+ /** @} */
698
+
699
+ /* ------------------------------------------------------------------------- */
700
+ /** \brief Marks an instantaneous event in the application.
701
+ *
702
+ * A marker can contain a text message or specify additional information
703
+ * using the event attributes structure. These attributes include a text
704
+ * message, color, category, and a payload. Each of the attributes is optional
705
+ * and can only be sent out using the \ref nvtxMarkEx function.
706
+ * If \ref nvtxMarkA or \ref nvtxMarkW are used to specify the marker
707
+ * or if an attribute is unspecified then a default value will be used.
708
+ *
709
+ * \param eventAttrib - The event attribute structure defining the marker's
710
+ * attribute types and attribute values.
711
+ *
712
+ * \par Example:
713
+ * \code
714
+ * // zero the structure
715
+ * nvtxEventAttributes_t eventAttrib = {0};
716
+ * // set the version and the size information
717
+ * eventAttrib.version = NVTX_VERSION;
718
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
719
+ * // configure the attributes. 0 is the default for all attributes.
720
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
721
+ * eventAttrib.color = 0xFF880000;
722
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
723
+ * eventAttrib.message.ascii = "Example nvtxMarkEx";
724
+ * nvtxMarkEx(&eventAttrib);
725
+ * \endcode
726
+ *
727
+ * \sa
728
+ * ::nvtxDomainMarkEx
729
+ *
730
+ * \version \NVTX_VERSION_1
731
+ * @{ */
732
+ NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib);
733
+ /** @} */
734
+
735
+ /* ------------------------------------------------------------------------- */
736
+ /** \brief Marks an instantaneous event in the application.
737
+ *
738
+ * A marker created using \ref nvtxMarkA or \ref nvtxMarkW contains only a
739
+ * text message.
740
+ *
741
+ * \param message - The message associated to this marker event.
742
+ *
743
+ * \par Example:
744
+ * \code
745
+ * nvtxMarkA("Example nvtxMarkA");
746
+ * nvtxMarkW(L"Example nvtxMarkW");
747
+ * \endcode
748
+ *
749
+ * \sa
750
+ * ::nvtxDomainMarkEx
751
+ * ::nvtxMarkEx
752
+ *
753
+ * \version \NVTX_VERSION_0
754
+ * @{ */
755
+ NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message);
756
+ NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message);
757
+ /** @} */
758
+
759
+
760
+ /** \name Process Ranges */
761
+
762
+ /* ------------------------------------------------------------------------- */
763
+ /** \brief Starts a process range in a domain.
764
+ *
765
+ * \param domain - The domain of scoping the category.
766
+ * \param eventAttrib - The event attribute structure defining the range's
767
+ * attribute types and attribute values.
768
+ *
769
+ * \return The unique ID used to correlate a pair of Start and End events.
770
+ *
771
+ * \remarks Ranges defined by Start/End can overlap.
772
+ *
773
+ * \par Example:
774
+ * \code
775
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain");
776
+ * nvtxEventAttributes_t eventAttrib = {0};
777
+ * eventAttrib.version = NVTX_VERSION;
778
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
779
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
780
+ * eventAttrib.message.ascii = "my range";
781
+ * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib);
782
+ * // ...
783
+ * nvtxDomainRangeEnd(rangeId);
784
+ * \endcode
785
+ *
786
+ * \sa
787
+ * ::nvtxDomainRangeEnd
788
+ *
789
+ * \version \NVTX_VERSION_2
790
+ * @{ */
791
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
792
+ /** @} */
793
+
794
+ /* ------------------------------------------------------------------------- */
795
+ /** \brief Starts a process range.
796
+ *
797
+ * \param eventAttrib - The event attribute structure defining the range's
798
+ * attribute types and attribute values.
799
+ *
800
+ * \return The unique ID used to correlate a pair of Start and End events.
801
+ *
802
+ * \remarks Ranges defined by Start/End can overlap.
803
+ *
804
+ * \par Example:
805
+ * \code
806
+ * nvtxEventAttributes_t eventAttrib = {0};
807
+ * eventAttrib.version = NVTX_VERSION;
808
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
809
+ * eventAttrib.category = 3;
810
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
811
+ * eventAttrib.color = 0xFF0088FF;
812
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
813
+ * eventAttrib.message.ascii = "Example Range";
814
+ * nvtxRangeId_t rangeId = nvtxRangeStartEx(&eventAttrib);
815
+ * // ...
816
+ * nvtxRangeEnd(rangeId);
817
+ * \endcode
818
+ *
819
+ * \sa
820
+ * ::nvtxRangeEnd
821
+ * ::nvtxDomainRangeStartEx
822
+ *
823
+ * \version \NVTX_VERSION_1
824
+ * @{ */
825
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib);
826
+ /** @} */
827
+
828
+ /* ------------------------------------------------------------------------- */
829
+ /** \brief Starts a process range.
830
+ *
831
+ * \param message - The event message associated to this range event.
832
+ *
833
+ * \return The unique ID used to correlate a pair of Start and End events.
834
+ *
835
+ * \remarks Ranges defined by Start/End can overlap.
836
+ *
837
+ * \par Example:
838
+ * \code
839
+ * nvtxRangeId_t r1 = nvtxRangeStartA("Range 1");
840
+ * nvtxRangeId_t r2 = nvtxRangeStartW(L"Range 2");
841
+ * nvtxRangeEnd(r1);
842
+ * nvtxRangeEnd(r2);
843
+ * \endcode
844
+ *
845
+ * \sa
846
+ * ::nvtxRangeEnd
847
+ * ::nvtxRangeStartEx
848
+ * ::nvtxDomainRangeStartEx
849
+ *
850
+ * \version \NVTX_VERSION_0
851
+ * @{ */
852
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message);
853
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message);
854
+ /** @} */
855
+
856
+ /* ------------------------------------------------------------------------- */
857
+ /** \brief Ends a process range.
858
+ *
859
+ * \param domain - The domain
860
+ * \param id - The correlation ID returned from a nvtxRangeStart call.
861
+ *
862
+ * \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd.
863
+ * It does not need a domain param since that is associated iwth the range ID at ::nvtxDomainRangeStartEx
864
+ *
865
+ * \par Example:
866
+ * \code
867
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain");
868
+ * nvtxEventAttributes_t eventAttrib = {0};
869
+ * eventAttrib.version = NVTX_VERSION;
870
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
871
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
872
+ * eventAttrib.message.ascii = "my range";
873
+ * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib);
874
+ * // ...
875
+ * nvtxDomainRangeEnd(rangeId);
876
+ * \endcode
877
+ *
878
+ * \sa
879
+ * ::nvtxDomainRangeStartEx
880
+ *
881
+ * \version \NVTX_VERSION_2
882
+ * @{ */
883
+ NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id);
884
+ /** @} */
885
+
886
+ /* ------------------------------------------------------------------------- */
887
+ /** \brief Ends a process range.
888
+ *
889
+ * \param id - The correlation ID returned from an nvtxRangeStart call.
890
+ *
891
+ * \sa
892
+ * ::nvtxDomainRangeStartEx
893
+ * ::nvtxRangeStartEx
894
+ * ::nvtxRangeStartA
895
+ * ::nvtxRangeStartW
896
+ *
897
+ * \version \NVTX_VERSION_0
898
+ * @{ */
899
+ NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id);
900
+ /** @} */
901
+
902
+ /** \name Thread Ranges */
903
+
904
+ /* ------------------------------------------------------------------------- */
905
+ /** \brief Starts a nested thread range.
906
+ *
907
+ * \param domain - The domain of scoping.
908
+ * \param eventAttrib - The event attribute structure defining the range's
909
+ * attribute types and attribute values.
910
+ *
911
+ * \return The 0 based level of range being started. This value is scoped to the domain.
912
+ * If an error occurs, a negative value is returned.
913
+ *
914
+ * \par Example:
915
+ * \code
916
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
917
+ * nvtxEventAttributes_t eventAttrib = {0};
918
+ * eventAttrib.version = NVTX_VERSION;
919
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
920
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
921
+ * eventAttrib.color = 0xFFFF0000;
922
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
923
+ * eventAttrib.message.ascii = "Level 0";
924
+ * nvtxDomainRangePushEx(domain, &eventAttrib);
925
+ *
926
+ * // Re-use eventAttrib
927
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
928
+ * eventAttrib.message.unicode = L"Level 1";
929
+ * nvtxDomainRangePushEx(domain, &eventAttrib);
930
+ *
931
+ * nvtxDomainRangePop(domain); //level 1
932
+ * nvtxDomainRangePop(domain); //level 0
933
+ * \endcode
934
+ *
935
+ * \sa
936
+ * ::nvtxDomainRangePop
937
+ *
938
+ * \version \NVTX_VERSION_2
939
+ * @{ */
940
+ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
941
+ /** @} */
942
+
943
+ /* ------------------------------------------------------------------------- */
944
+ /** \brief Starts a nested thread range.
945
+ *
946
+ * \param eventAttrib - The event attribute structure defining the range's
947
+ * attribute types and attribute values.
948
+ *
949
+ * \return The 0 based level of range being started. This level is per domain.
950
+ * If an error occurs a negative value is returned.
951
+ *
952
+ * \par Example:
953
+ * \code
954
+ * nvtxEventAttributes_t eventAttrib = {0};
955
+ * eventAttrib.version = NVTX_VERSION;
956
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
957
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
958
+ * eventAttrib.color = 0xFFFF0000;
959
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
960
+ * eventAttrib.message.ascii = "Level 0";
961
+ * nvtxRangePushEx(&eventAttrib);
962
+ *
963
+ * // Re-use eventAttrib
964
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
965
+ * eventAttrib.message.unicode = L"Level 1";
966
+ * nvtxRangePushEx(&eventAttrib);
967
+ *
968
+ * nvtxRangePop();
969
+ * nvtxRangePop();
970
+ * \endcode
971
+ *
972
+ * \sa
973
+ * ::nvtxDomainRangePushEx
974
+ * ::nvtxRangePop
975
+ *
976
+ * \version \NVTX_VERSION_1
977
+ * @{ */
978
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib);
979
+ /** @} */
980
+
981
+ /* ------------------------------------------------------------------------- */
982
+ /** \brief Starts a nested thread range.
983
+ *
984
+ * \param message - The event message associated to this range event.
985
+ *
986
+ * \return The 0 based level of range being started. If an error occurs a
987
+ * negative value is returned.
988
+ *
989
+ * \par Example:
990
+ * \code
991
+ * nvtxRangePushA("Level 0");
992
+ * nvtxRangePushW(L"Level 1");
993
+ * nvtxRangePop();
994
+ * nvtxRangePop();
995
+ * \endcode
996
+ *
997
+ * \sa
998
+ * ::nvtxDomainRangePushEx
999
+ * ::nvtxRangePop
1000
+ *
1001
+ * \version \NVTX_VERSION_0
1002
+ * @{ */
1003
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message);
1004
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message);
1005
+ /** @} */
1006
+
1007
+
1008
+ /* ------------------------------------------------------------------------- */
1009
+ /** \brief Ends a nested thread range.
1010
+ *
1011
+ * \return The level of the range being ended. If an error occurs a negative
1012
+ * value is returned on the current thread.
1013
+ *
1014
+ * \par Example:
1015
+ * \code
1016
+ * nvtxDomainHandle_t domain = nvtxDomainCreate("example library");
1017
+ * nvtxDomainRangePushA(domain, "Level 0");
1018
+ * nvtxDomainRangePushW(domain, L"Level 1");
1019
+ * nvtxDomainRangePop(domain);
1020
+ * nvtxDomainRangePop(domain);
1021
+ * \endcode
1022
+ *
1023
+ * \sa
1024
+ * ::nvtxRangePushEx
1025
+ * ::nvtxRangePushA
1026
+ * ::nvtxRangePushW
1027
+ *
1028
+ * \version \NVTX_VERSION_2
1029
+ * @{ */
1030
+ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain);
1031
+ /** @} */
1032
+
1033
+ /* ------------------------------------------------------------------------- */
1034
+ /** \brief Ends a nested thread range.
1035
+ *
1036
+ * \return The level of the range being ended. If an error occurs a negative
1037
+ * value is returned on the current thread.
1038
+ *
1039
+ * \par Example:
1040
+ * \code
1041
+ * nvtxRangePushA("Level 0");
1042
+ * nvtxRangePushW(L"Level 1");
1043
+ * nvtxRangePop();
1044
+ * nvtxRangePop();
1045
+ * \endcode
1046
+ *
1047
+ * \sa
1048
+ * ::nvtxRangePushEx
1049
+ * ::nvtxRangePushA
1050
+ * ::nvtxRangePushW
1051
+ *
1052
+ * \version \NVTX_VERSION_0
1053
+ * @{ */
1054
+ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void);
1055
+ /** @} */
1056
+
1057
+
1058
/** @} */ /*END defgroup*/
/* ========================================================================= */
/** \defgroup RESOURCE_NAMING Resource Naming
 *
 * See \ref RESOURCE_NAMING for more details
 *
 * @{
 */


/* ------------------------------------------------------------------------- */
/** \name Functions for Generic Resource Naming*/
/* ------------------------------------------------------------------------- */

/* ------------------------------------------------------------------------- */
/** \cond SHOW_HIDDEN
 * \brief Resource typing helpers.
 *
 * Classes make it easy to create a series of resource types per API
 * without collisions: the class occupies the high 16 bits, the index the
 * low 16 bits.
 */
#define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX)))
#define NVTX_RESOURCE_CLASS_GENERIC 1
/** \endcond */

/* ------------------------------------------------------------------------- */
/** \brief Generic resource type for when a resource class is not available.
 *
 * \sa
 * ::nvtxDomainResourceCreate
 *
 * \version \NVTX_VERSION_2
 */
typedef enum nvtxResourceGenericType_t
{
    NVTX_RESOURCE_TYPE_UNKNOWN               = 0,
    NVTX_RESOURCE_TYPE_GENERIC_POINTER       = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 1), /**< Generic pointer assumed to have no collisions with other pointers. */
    NVTX_RESOURCE_TYPE_GENERIC_HANDLE        = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 2), /**< Generic handle assumed to have no collisions with other handles. */
    NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 3), /**< OS native thread identifier. */
    NVTX_RESOURCE_TYPE_GENERIC_THREAD_POSIX  = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 4)  /**< POSIX pthread identifier. */
} nvtxResourceGenericType_t;
1099
+
1100
+
1101
+
1102
+ /** \brief Resource Attribute Structure.
1103
+ * \anchor RESOURCE_ATTRIBUTE_STRUCTURE
1104
+ *
1105
+ * This structure is used to describe the attributes of a resource. The layout of
1106
+ * the structure is defined by a specific version of the tools extension
1107
+ * library and can change between different versions of the Tools Extension
1108
+ * library.
1109
+ *
1110
+ * \par Initializing the Attributes
1111
+ *
1112
+ * The caller should always perform the following three tasks when using
1113
+ * attributes:
1114
+ * <ul>
1115
+ * <li>Zero the structure
1116
+ * <li>Set the version field
1117
+ * <li>Set the size field
1118
+ * </ul>
1119
+ *
1120
+ * Zeroing the structure sets all the resource attributes types and values
1121
+ * to the default value.
1122
+ *
1123
+ * The version and size field are used by the Tools Extension
1124
+ * implementation to handle multiple versions of the attributes structure.
1125
+ *
1126
+ * It is recommended that the caller use one of the following to methods
1127
+ * to initialize the event attributes structure:
1128
+ *
1129
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
1130
+ * \code
1131
+ * nvtxResourceAttributes_t attribs = {0};
1132
+ * attribs.version = NVTX_VERSION;
1133
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1134
+ * \endcode
1135
+ *
1136
+ * \par Method 2: Initializing nvtxEventAttributes for a specific version
1137
+ * \code
1138
+ * nvtxResourceAttributes_v0 attribs = {0};
1139
+ * attribs.version = 2;
1140
+ * attribs.size = (uint16_t)(sizeof(nvtxResourceAttributes_v0));
1141
+ * \endcode
1142
+ *
1143
+ * If the caller uses Method 1 it is critical that the entire binary
1144
+ * layout of the structure be configured to 0 so that all fields
1145
+ * are initialized to the default value.
1146
+ *
1147
+ * The caller should either use both NVTX_VERSION and
1148
+ * NVTX_RESOURCE_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
1149
+ * and a versioned type (Method 2). Using a mix of the two methods
1150
+ * will likely cause either source level incompatibility or binary
1151
+ * incompatibility in the future.
1152
+ *
1153
+ * \par Settings Attribute Types and Values
1154
+ *
1155
+ *
1156
+ * \par Example:
1157
+ * \code
1158
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1159
+ *
1160
+ * // Initialize
1161
+ * nvtxResourceAttributes_t attribs = {0};
1162
+ * attribs.version = NVTX_VERSION;
1163
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1164
+ *
1165
+ * // Configure the Attributes
1166
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1167
+ * attribs.identifier.pValue = (const void*)pMutex;
1168
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1169
+ * attribs.message.ascii = "Single thread access to database.";
1170
+ *
1171
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1172
+ * \endcode
1173
+ *
1174
+ * \sa
1175
+ * ::nvtxDomainResourceCreate
1176
+ */
1177
+ typedef struct nvtxResourceAttributes_v0
1178
+ {
1179
+ /**
1180
+ * \brief Version flag of the structure.
1181
+ *
1182
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
1183
+ * supported in this header file. This can optionally be overridden to
1184
+ * another version of the tools extension library.
1185
+ */
1186
+ uint16_t version;
1187
+
1188
+ /**
1189
+ * \brief Size of the structure.
1190
+ *
1191
+ * Needs to be set to the size in bytes of this attribute
1192
+ * structure.
1193
+ */
1194
+ uint16_t size;
1195
+
1196
+ /**
1197
+ * \brief Identifier type specifies how to interpret the identifier field
1198
+ *
1199
+ * Defines the identifier format of the attribute structure's \ref RESOURCE_IDENTIFIER_FIELD
1200
+ * "identifier" field.
1201
+ *
1202
+ * Default Value is NVTX_RESOURCE_TYPE_UNKNOWN
1203
+ */
1204
+ int32_t identifierType; /* values from enums following the pattern nvtxResource[name]Type_t */
1205
+
1206
+ /**
1207
+ * \brief Identifier for the resource.
1208
+ * \anchor RESOURCE_IDENTIFIER_FIELD
1209
+ *
1210
+ * An identifier may be a pointer or a handle to an OS or middleware API object.
1211
+ * The resource type will assist in avoiding collisions where handles values may collide.
1212
+ */
1213
+ union identifier_t
1214
+ {
1215
+ const void* pValue;
1216
+ uint64_t ullValue;
1217
+ } identifier;
1218
+
1219
+ /** \brief Message type specified in this attribute structure.
1220
+ *
1221
+ * Defines the message format of the attribute structure's \ref RESOURCE_MESSAGE_FIELD
1222
+ * "message" field.
1223
+ *
1224
+ * Default Value is NVTX_MESSAGE_UNKNOWN
1225
+ */
1226
+ int32_t messageType; /* nvtxMessageType_t */
1227
+
1228
+ /** \brief Message assigned to this attribute structure. \anchor RESOURCE_MESSAGE_FIELD
1229
+ *
1230
+ * The text message that is attached to a resource.
1231
+ */
1232
+ nvtxMessageValue_t message;
1233
+
1234
+ } nvtxResourceAttributes_v0;
1235
+
1236
+ typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t;
1237
+
1238
+ /* \cond SHOW_HIDDEN
1239
+ * \version \NVTX_VERSION_2
1240
+ */
1241
+ #define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) )
1242
+ typedef struct nvtxResourceHandle* nvtxResourceHandle_t;
1243
+ /** \endcond */
1244
+
1245
+
1246
+
1247
+ /* ------------------------------------------------------------------------- */
1248
+ /** \brief Create a resource object to track and associate data with OS and middleware objects
1249
+ *
1250
+ * Allows users to associate an API handle or pointer with a user-provided name.
1251
+ *
1252
+ *
1253
+ * \param domain - Domain to own the resource object
1254
+ * \param attribs - Attributes to be associated with the resource
1255
+ *
1256
+ * \return A handle that represents the newly created resource object.
1257
+ *
1258
+ * \par Example:
1259
+ * \code
1260
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1261
+ * nvtxResourceAttributes_t attribs = {0};
1262
+ * attribs.version = NVTX_VERSION;
1263
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1264
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1265
+ * attribs.identifier.pValue = (const void*)pMutex;
1266
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1267
+ * attribs.message.ascii = "Single thread access to database.";
1268
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1269
+ * \endcode
1270
+ *
1271
+ * \sa
1272
+ * ::nvtxResourceAttributes_t
1273
+ * ::nvtxDomainResourceDestroy
1274
+ *
1275
+ * \version \NVTX_VERSION_2
1276
+ * @{ */
1277
+ NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
1278
+ /** @} */
1279
+
1280
+ /* ------------------------------------------------------------------------- */
1281
+ /** \brief Destroy a resource object to track and associate data with OS and middleware objects
1282
+ *
1283
+ * Allows users to associate an API handle or pointer with a user-provided name.
1284
+ *
1285
+ * \param resource - Handle to the resource in which to operate.
1286
+ *
1287
+ * \par Example:
1288
+ * \code
1289
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1290
+ * nvtxResourceAttributes_t attribs = {0};
1291
+ * attribs.version = NVTX_VERSION;
1292
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1293
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1294
+ * attribs.identifier.pValue = (const void*)pMutex;
1295
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1296
+ * attribs.message.ascii = "Single thread access to database.";
1297
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1298
+ * nvtxDomainResourceDestroy(handle);
1299
+ * \endcode
1300
+ *
1301
+ * \sa
1302
+ * ::nvtxDomainResourceCreate
1303
+ *
1304
+ * \version \NVTX_VERSION_2
1305
+ * @{ */
1306
+ NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource);
1307
+ /** @} */
1308
+
1309
+
1310
+ /** \name Functions for NVTX Category Naming*/
1311
+
1312
+ /* ------------------------------------------------------------------------- */
1313
+ /**
1314
+ * \brief Annotate an NVTX category used within a domain.
1315
+ *
1316
+ * Categories are used to group sets of events. Each category is identified
1317
+ * through a unique ID and that ID is passed into any of the marker/range
1318
+ * events to assign that event to a specific category. The nvtxDomainNameCategory
1319
+ * function calls allow the user to assign a name to a category ID that is
1320
+ * specific to the domain.
1321
+ *
1322
+ * nvtxDomainNameCategory(NULL, category, name) is equivalent to calling
1323
+ * nvtxNameCategory(category, name).
1324
+ *
1325
+ * \param domain - The domain of scoping the category.
1326
+ * \param category - The category ID to name.
1327
+ * \param name - The name of the category.
1328
+ *
1329
+ * \remarks The category names are tracked per domain.
1330
+ *
1331
+ * \par Example:
1332
+ * \code
1333
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example");
1334
+ * nvtxDomainNameCategoryA(domain, 1, "Memory Allocation");
1335
+ * nvtxDomainNameCategoryW(domain, 2, L"Memory Transfer");
1336
+ * \endcode
1337
+ *
1338
+ * \version \NVTX_VERSION_2
1339
+ * @{ */
1340
+ NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name);
1341
+ NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
1342
+ /** @} */
1343
+
1344
+ /** \brief Annotate an NVTX category.
1345
+ *
1346
+ * Categories are used to group sets of events. Each category is identified
1347
+ * through a unique ID and that ID is passed into any of the marker/range
1348
+ * events to assign that event to a specific category. The nvtxNameCategory
1349
+ * function calls allow the user to assign a name to a category ID.
1350
+ *
1351
+ * \param category - The category ID to name.
1352
+ * \param name - The name of the category.
1353
+ *
1354
+ * \remarks The category names are tracked per process.
1355
+ *
1356
+ * \par Example:
1357
+ * \code
1358
+ * nvtxNameCategory(1, "Memory Allocation");
1359
+ * nvtxNameCategory(2, "Memory Transfer");
1360
+ * nvtxNameCategory(3, "Memory Object Lifetime");
1361
+ * \endcode
1362
+ *
1363
+ * \version \NVTX_VERSION_1
1364
+ * @{ */
1365
+ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name);
1366
+ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name);
1367
+ /** @} */
1368
+
1369
+ /** \name Functions for OS Threads Naming*/
1370
+
1371
+ /* ------------------------------------------------------------------------- */
1372
+ /** \brief Annotate an OS thread.
1373
+ *
1374
+ * Allows the user to name an active thread of the current process. If an
1375
+ * invalid thread ID is provided or a thread ID from a different process is
1376
+ * used the behavior of the tool is implementation dependent.
1377
+ *
1378
+ * The thread name is associated to the default domain. To support domains
1379
+ * use resource objects via ::nvtxDomainResourceCreate.
1380
+ *
1381
+ * \param threadId - The ID of the thread to name.
1382
+ * \param name - The name of the thread.
1383
+ *
1384
+ * \par Example:
1385
+ * \code
1386
+ * nvtxNameOsThread(GetCurrentThreadId(), "MAIN_THREAD");
1387
+ * \endcode
1388
+ *
1389
+ * \version \NVTX_VERSION_1
1390
+ * @{ */
1391
+ NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name);
1392
+ NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name);
1393
+ /** @} */
1394
+
1395
+
1396
+ /** @} */ /*END defgroup*/
1397
+ /* ========================================================================= */
1398
+ /** \defgroup STRING_REGISTRATION String Registration
1399
+ *
1400
+ * Registered strings are intended to increase performance by lowering instrumentation
1401
+ * overhead. String may be registered once and the handle may be passed in place of
1402
+ * a string where an the APIs may allow.
1403
+ *
1404
+ * See \ref STRING_REGISTRATION for more details
1405
+ *
1406
+ * @{
1407
+ */
1408
+
1409
+ /* ------------------------------------------------------------------------- */
1410
+ /** \brief Register a string.
1411
+
1412
+ * Registers an immutable string with NVTX. Once registered the pointer used
1413
+ * to register the domain name can be used in nvtxEventAttributes_t
1414
+ * \ref MESSAGE_FIELD. This allows NVTX implementation to skip copying the
1415
+ * contents of the message on each event invocation.
1416
+ *
1417
+ * String registration is an optimization. It is recommended to use string
1418
+ * registration if the string will be passed to an event many times.
1419
+ *
1420
+ * String are not unregistered, except that by unregistering the entire domain
1421
+ *
1422
+ * \param domain - Domain handle. If NULL then the global domain is used.
1423
+ * \param string - A unique pointer to a sequence of characters.
1424
+ *
1425
+ * \return A handle representing the registered string.
1426
+ *
1427
+ * \par Example:
1428
+ * \code
1429
+ * nvtxDomainCreateA("com.nvidia.nvtx.example");
1430
+ * nvtxStringHandle_t message = nvtxDomainRegisterStringA(domain, "registered string");
1431
+ * nvtxEventAttributes_t eventAttrib = {0};
1432
+ * eventAttrib.version = NVTX_VERSION;
1433
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1434
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_REGISTERED;
1435
+ * eventAttrib.message.registered = message;
1436
+ * \endcode
1437
+ *
1438
+ * \version \NVTX_VERSION_2
1439
+ * @{ */
1440
+ NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string);
1441
+ NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string);
1442
+ /** @} */
1443
+
1444
+ /** @} */ /*END defgroup*/
1445
+ /* ========================================================================= */
1446
+ /** \defgroup DOMAINS Domains
1447
+ *
1448
+ * Domains are used to group events to a developer defined scope. Middleware
1449
+ * vendors may also scope their own events to avoid collisions with the
1450
+ * the application developer's events, so that the application developer may
1451
+ * inspect both parts and easily differentiate or filter them. By default
1452
+ * all events are scoped to a global domain where NULL is provided or when
1453
+ * using APIs provided b versions of NVTX below v2
1454
+ *
1455
+ * Domains are intended to be typically long lived objects with the intention
1456
+ * of logically separating events of large modules from each other such as
1457
+ * middleware libraries from each other and the main application.
1458
+ *
1459
+ * See \ref DOMAINS for more details
1460
+ *
1461
+ * @{
1462
+ */
1463
+
1464
+ /* ------------------------------------------------------------------------- */
1465
+ /** \brief Register a NVTX domain.
1466
+ *
1467
+ * Domains are used to scope annotations. All NVTX_VERSION_0 and NVTX_VERSION_1
1468
+ * annotations are scoped to the global domain. The function nvtxDomainCreate
1469
+ * creates a new named domain.
1470
+ *
1471
+ * Each domain maintains its own nvtxRangePush and nvtxRangePop stack.
1472
+ *
1473
+ * \param name - A unique string representing the domain.
1474
+ *
1475
+ * \return A handle representing the domain.
1476
+ *
1477
+ * \par Example:
1478
+ * \code
1479
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example");
1480
+ *
1481
+ * nvtxMarkA("nvtxMarkA to global domain");
1482
+ *
1483
+ * nvtxEventAttributes_t eventAttrib1 = {0};
1484
+ * eventAttrib1.version = NVTX_VERSION;
1485
+ * eventAttrib1.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1486
+ * eventAttrib1.message.ascii = "nvtxDomainMarkEx to global domain";
1487
+ * nvtxDomainMarkEx(NULL, &eventAttrib1);
1488
+ *
1489
+ * nvtxEventAttributes_t eventAttrib2 = {0};
1490
+ * eventAttrib2.version = NVTX_VERSION;
1491
+ * eventAttrib2.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1492
+ * eventAttrib2.message.ascii = "nvtxDomainMarkEx to com.nvidia.nvtx.example";
1493
+ * nvtxDomainMarkEx(domain, &eventAttrib2);
1494
+ * nvtxDomainDestroy(domain);
1495
+ * \endcode
1496
+ *
1497
+ * \sa
1498
+ * ::nvtxDomainDestroy
1499
+ *
1500
+ * \version \NVTX_VERSION_2
1501
+ * @{ */
1502
+ NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* name);
1503
+ NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* name);
1504
+ /** @} */
1505
+
1506
+ /* ------------------------------------------------------------------------- */
1507
+ /** \brief Unregister a NVTX domain.
1508
+ *
1509
+ * Unregisters the domain handle and frees all domain specific resources.
1510
+ *
1511
+ * \param domain - the domain handle
1512
+ *
1513
+ * \par Example:
1514
+ * \code
1515
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example");
1516
+ * nvtxDomainDestroy(domain);
1517
+ * \endcode
1518
+ *
1519
+ * \sa
1520
+ * ::nvtxDomainCreateA
1521
+ * ::nvtxDomainCreateW
1522
+ *
1523
+ * \version \NVTX_VERSION_2
1524
+ * @{ */
1525
+ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain);
1526
+ /** @} */
1527
+
1528
+
1529
+ /** @} */ /*END defgroup*/
1530
+ /* ========================================================================= */
1531
+ /** \cond SHOW_HIDDEN */
1532
+
1533
+ #ifdef UNICODE
1534
+ #define nvtxMark nvtxMarkW
1535
+ #define nvtxRangeStart nvtxRangeStartW
1536
+ #define nvtxRangePush nvtxRangePushW
1537
+ #define nvtxNameCategory nvtxNameCategoryW
1538
+ #define nvtxNameOsThread nvtxNameOsThreadW
1539
+ /* NVTX_VERSION_2 */
1540
+ #define nvtxDomainCreate nvtxDomainCreateW
1541
+ #define nvtxDomainRegisterString nvtxDomainRegisterStringW
1542
+ #define nvtxDomainNameCategory nvtxDomainNameCategoryW
1543
+ #else
1544
+ #define nvtxMark nvtxMarkA
1545
+ #define nvtxRangeStart nvtxRangeStartA
1546
+ #define nvtxRangePush nvtxRangePushA
1547
+ #define nvtxNameCategory nvtxNameCategoryA
1548
+ #define nvtxNameOsThread nvtxNameOsThreadA
1549
+ /* NVTX_VERSION_2 */
1550
+ #define nvtxDomainCreate nvtxDomainCreateA
1551
+ #define nvtxDomainRegisterString nvtxDomainRegisterStringA
1552
+ #define nvtxDomainNameCategory nvtxDomainNameCategoryA
1553
+ #endif
1554
+
1555
+ /** \endcond */
1556
+
1557
+ #ifdef __cplusplus
1558
+ }
1559
+ #endif /* __cplusplus */
1560
+
1561
+ #endif /* NVTOOLSEXT_H_ */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCuda.h ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #ifndef NVTOOLSEXT_CUDA_H_
39
+ #define NVTOOLSEXT_CUDA_H_
40
+
41
+ #include "cuda.h"
42
+
43
+ #include "nvToolsExt.h"
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for CUDA Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_CUDA CUDA Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate CUDA resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_CUDA 4
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for CUDA
71
+ */
72
+ typedef enum nvtxResourceCUDAType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
75
+ NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
76
+ NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
77
+ NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4) /* CUevent */
78
+ } nvtxResourceCUDAType_t;
79
+
80
+
81
+ /* ------------------------------------------------------------------------- */
82
+ /** \brief Annotates a CUDA device.
83
+ *
84
+ * Allows the user to associate a CUDA device with a user-provided name.
85
+ *
86
+ * \param device - The handle of the CUDA device to name.
87
+ * \param name - The name of the CUDA device.
88
+ *
89
+ * \version \NVTX_VERSION_1
90
+ * @{ */
91
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
92
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
93
+ /** @} */
94
+
95
+ /* ------------------------------------------------------------------------- */
96
+ /** \brief Annotates a CUDA context.
97
+ *
98
+ * Allows the user to associate a CUDA context with a user-provided name.
99
+ *
100
+ * \param context - The handle of the CUDA context to name.
101
+ * \param name - The name of the CUDA context.
102
+ *
103
+ * \par Example:
104
+ * \code
105
+ * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
106
+ * if ( CUDA_SUCCESS != status )
107
+ * goto Error;
108
+ * nvtxNameCuContext(cuContext, "CTX_NAME");
109
+ * \endcode
110
+ *
111
+ * \version \NVTX_VERSION_1
112
+ * @{ */
113
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
114
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
115
+ /** @} */
116
+
117
+ /* ------------------------------------------------------------------------- */
118
+ /** \brief Annotates a CUDA stream.
119
+ *
120
+ * Allows the user to associate a CUDA stream with a user-provided name.
121
+ *
122
+ * \param stream - The handle of the CUDA stream to name.
123
+ * \param name - The name of the CUDA stream.
124
+ *
125
+ * \version \NVTX_VERSION_1
126
+ * @{ */
127
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
128
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
129
+ /** @} */
130
+
131
+ /* ------------------------------------------------------------------------- */
132
+ /** \brief Annotates a CUDA event.
133
+ *
134
+ * Allows the user to associate a CUDA event with a user-provided name.
135
+ *
136
+ * \param event - The handle of the CUDA event to name.
137
+ * \param name - The name of the CUDA event.
138
+ *
139
+ * \version \NVTX_VERSION_1
140
+ * @{ */
141
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
142
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
143
+ /** @} */
144
+
145
+ /** @} */ /* END RESOURCE_NAMING */
146
+
147
+ /* ========================================================================= */
148
+ #ifdef UNICODE
149
+ #define nvtxNameCuDevice nvtxNameCuDeviceW
150
+ #define nvtxNameCuContext nvtxNameCuContextW
151
+ #define nvtxNameCuStream nvtxNameCuStreamW
152
+ #define nvtxNameCuEvent nvtxNameCuEventW
153
+ #else
154
+ #define nvtxNameCuDevice nvtxNameCuDeviceA
155
+ #define nvtxNameCuContext nvtxNameCuContextA
156
+ #define nvtxNameCuStream nvtxNameCuStreamA
157
+ #define nvtxNameCuEvent nvtxNameCuEventA
158
+ #endif
159
+
160
+ #ifdef __cplusplus
161
+ }
162
+ #endif /* __cplusplus */
163
+
164
+ #endif /* NVTOOLSEXT_CUDA_H_ */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCudaRt.h ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #ifndef NVTOOLSEXT_CUDART_H_
39
+ #define NVTOOLSEXT_CUDART_H_
40
+
41
+ #include "cuda.h"
42
+ #include "driver_types.h"
43
+
44
+ #include "nvToolsExt.h"
45
+
46
+ #ifdef __cplusplus
47
+ extern "C" {
48
+ #endif /* __cplusplus */
49
+
50
+ /* ========================================================================= */
51
+ /** \name Functions for CUDA Resource Naming
52
+ */
53
+ /** \addtogroup RESOURCE_NAMING
54
+ * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
55
+ *
56
+ * This section covers the API functions that allow to annotate CUDA resources
57
+ * with user-provided names.
58
+ *
59
+ * @{
60
+ */
61
+
62
+ /* ------------------------------------------------------------------------- */
63
+ /* \cond SHOW_HIDDEN
64
+ * \brief Used to build a non-colliding value for resource types separated class
65
+ * \version \NVTX_VERSION_2
66
+ */
67
+ #define NVTX_RESOURCE_CLASS_CUDART 5
68
+ /** \endcond */
69
+
70
+ /* ------------------------------------------------------------------------- */
71
+ /** \brief Resource types for CUDART
72
+ */
73
+ typedef enum nvtxResourceCUDARTType_t
74
+ {
75
+ NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
76
+ NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
77
+ NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2) /* cudaEvent_t */
78
+ } nvtxResourceCUDARTType_t;
79
+
80
+
81
+ /* ------------------------------------------------------------------------- */
82
+ /** \brief Annotates a CUDA device.
83
+ *
84
+ * Allows the user to associate a CUDA device with a user-provided name.
85
+ *
86
+ * \param device - The id of the CUDA device to name.
87
+ * \param name - The name of the CUDA device.
88
+ *
89
+ * \version \NVTX_VERSION_1
90
+ * @{ */
91
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
92
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
93
+ /** @} */
94
+
95
+ /* ------------------------------------------------------------------------- */
96
+ /** \brief Annotates a CUDA stream.
97
+ *
98
+ * Allows the user to associate a CUDA stream with a user-provided name.
99
+ *
100
+ * \param stream - The handle of the CUDA stream to name.
101
+ * \param name - The name of the CUDA stream.
102
+ *
103
+ * \version \NVTX_VERSION_1
104
+ * @{ */
105
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
106
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
107
+ /** @} */
108
+
109
+ /* ------------------------------------------------------------------------- */
110
+ /** \brief Annotates a CUDA event.
111
+ *
112
+ * Allows the user to associate a CUDA event with a user-provided name.
113
+ *
114
+ * \param event - The handle of the CUDA event to name.
115
+ * \param name - The name of the CUDA event.
116
+ *
117
+ * \version \NVTX_VERSION_1
118
+ * @{ */
119
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
120
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
121
+ /** @} */
122
+
123
+ /** @} */ /* END RESOURCE_NAMING */
124
+
125
+ /* ========================================================================= */
126
+ #ifdef UNICODE
127
+ #define nvtxNameCudaDevice nvtxNameCudaDeviceW
128
+ #define nvtxNameCudaStream nvtxNameCudaStreamW
129
+ #define nvtxNameCudaEvent nvtxNameCudaEventW
130
+ #else
131
+ #define nvtxNameCudaDevice nvtxNameCudaDeviceA
132
+ #define nvtxNameCudaStream nvtxNameCudaStreamA
133
+ #define nvtxNameCudaEvent nvtxNameCudaEventA
134
+ #endif
135
+
136
+ #ifdef __cplusplus
137
+ }
138
+ #endif /* __cplusplus */
139
+
140
+ #endif /* NVTOOLSEXT_CUDART_H_ */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtOpenCL.h ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #ifndef NVTOOLSEXT_OPENCL_H_
39
+ #define NVTOOLSEXT_OPENCL_H_
40
+
41
+ #include <CL/cl.h>
42
+
43
+ #include "nvToolsExt.h"
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for OpenCL Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate OpenCL resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_OPENCL 6
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for OpenCL
71
+ */
72
+ typedef enum nvtxResourceOpenCLType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
75
+ NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
76
+ NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
77
+ NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
78
+ NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
79
+ NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
80
+ NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7)
81
+ } nvtxResourceOpenCLType_t;
82
+
83
+
84
+ /* ------------------------------------------------------------------------- */
85
+ /** \brief Annotates an OpenCL device.
86
+ *
87
+ * Allows to associate an OpenCL device with a user-provided name.
88
+ *
89
+ * \param device - The handle of the OpenCL device to name.
90
+ * \param name - The name of the OpenCL device.
91
+ *
92
+ * \version \NVTX_VERSION_1
93
+ * @{ */
94
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
95
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
96
+ /** @} */
97
+
98
+ /* ------------------------------------------------------------------------- */
99
+ /** \brief Annotates an OpenCL context.
100
+ *
101
+ * Allows to associate an OpenCL context with a user-provided name.
102
+ *
103
+ * \param context - The handle of the OpenCL context to name.
104
+ * \param name - The name of the OpenCL context.
105
+ *
106
+ * \version \NVTX_VERSION_1
107
+ * @{ */
108
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
109
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
110
+ /** @} */
111
+
112
+ /* ------------------------------------------------------------------------- */
113
+ /** \brief Annotates an OpenCL command queue.
114
+ *
115
+ * Allows to associate an OpenCL command queue with a user-provided name.
116
+ *
117
+ * \param command_queue - The handle of the OpenCL command queue to name.
118
+ * \param name - The name of the OpenCL command queue.
119
+ *
120
+ * \version \NVTX_VERSION_1
121
+ * @{ */
122
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
123
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
124
+ /** @} */
125
+
126
+ /* ------------------------------------------------------------------------- */
127
+ /** \brief Annotates an OpenCL memory object.
128
+ *
129
+ * Allows to associate an OpenCL memory object with a user-provided name.
130
+ *
131
+ * \param memobj - The handle of the OpenCL memory object to name.
132
+ * \param name - The name of the OpenCL memory object.
133
+ *
134
+ * \version \NVTX_VERSION_1
135
+ * @{ */
136
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
137
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
138
+ /** @} */
139
+
140
+ /* ------------------------------------------------------------------------- */
141
+ /** \brief Annotates an OpenCL sampler.
142
+ *
143
+ * Allows to associate an OpenCL sampler with a user-provided name.
144
+ *
145
+ * \param sampler - The handle of the OpenCL sampler to name.
146
+ * \param name - The name of the OpenCL sampler.
147
+ *
148
+ * \version \NVTX_VERSION_1
149
+ * @{ */
150
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
151
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
152
+ /** @} */
153
+
154
+ /* ------------------------------------------------------------------------- */
155
+ /** \brief Annotates an OpenCL program.
156
+ *
157
+ * Allows to associate an OpenCL program with a user-provided name.
158
+ *
159
+ * \param program - The handle of the OpenCL program to name.
160
+ * \param name - The name of the OpenCL program.
161
+ *
162
+ * \code
163
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
164
+ * (const char **) &cSourceCL, &program_length, &ciErrNum);
165
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
166
+ * nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
167
+ * \endcode
168
+ *
169
+ * \version \NVTX_VERSION_1
170
+ * @{ */
171
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
172
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
173
+ /** @} */
174
+
175
+ /* ------------------------------------------------------------------------- */
176
+ /** \brief Annotates an OpenCL event.
177
+ *
178
+ * Allows to associate an OpenCL event with a user-provided name.
179
+ *
180
+ * \param evnt - The handle of the OpenCL event to name.
181
+ * \param name - The name of the OpenCL event.
182
+ *
183
+ * \version \NVTX_VERSION_1
184
+ * @{ */
185
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
186
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
187
+ /** @} */
188
+
189
+ /** @} */ /* END RESOURCE_NAMING */
190
+
191
+ /* ========================================================================= */
192
+ #ifdef UNICODE
193
+ #define nvtxNameClDevice nvtxNameClDeviceW
194
+ #define nvtxNameClContext nvtxNameClContextW
195
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueW
196
+ #define nvtxNameClMemObject nvtxNameClMemObjectW
197
+ #define nvtxNameClSampler nvtxNameClSamplerW
198
+ #define nvtxNameClProgram nvtxNameClProgramW
199
+ #define nvtxNameClEvent nvtxNameClEventW
200
+ #else
201
+ #define nvtxNameClDevice nvtxNameClDeviceA
202
+ #define nvtxNameClContext nvtxNameClContextA
203
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueA
204
+ #define nvtxNameClMemObject nvtxNameClMemObjectA
205
+ #define nvtxNameClSampler nvtxNameClSamplerA
206
+ #define nvtxNameClProgram nvtxNameClProgramA
207
+ #define nvtxNameClEvent nvtxNameClEventA
208
+ #endif
209
+
210
+ #ifdef __cplusplus
211
+ }
212
+ #endif /* __cplusplus */
213
+
214
+ #endif /* NVTOOLSEXT_OPENCL_H_ */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtSync.h ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #ifndef NVTOOLSEXT_SYNC_H_
39
+ #define NVTOOLSEXT_SYNC_H_
40
+
41
+ #include "nvToolsExt.h"
42
+
43
+
44
+ #ifdef __cplusplus
45
+ extern "C" {
46
+ #endif /* __cplusplus */
47
+
48
+ /* \cond SHOW_HIDDEN
49
+ * \version \NVTX_VERSION_2
50
+ */
51
+ #define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
52
+ /** \endcond */
53
+
54
+
55
+ /**
56
+ * \page PAGE_SYNCHRONIZATION Synchronization
57
+ *
58
+ * This section covers a subset of the API that allow users to track additional
59
+ * synchronization details of their application. Naming OS synchronization primitives
60
+ * may allow users to better understand the data collected by traced synchronization
61
+ * APIs. Additionally, a user defined synchronization object can allow the users to
62
+ * to tell the tools when the user is building their own synchronization system
63
+ * that do not rely on the OS to provide behaviors and instead use techniques like
64
+ * atomic operations and spinlocks.
65
+ *
66
+ * See module \ref SYNCHRONIZATION for details.
67
+ *
68
+ * \par Example:
69
+ * \code
70
+ * class MyMutex
71
+ * {
72
+ * volatile long bLocked;
73
+ * nvtxSyncUser_t hSync;
74
+ * public:
75
+ * MyMutex(const char* name, nvtxDomainHandle_t d){
76
+ * bLocked = 0;
77
+ *
78
+ * nvtxSyncUserAttributes_t attribs = { 0 };
79
+ * attribs.version = NVTX_VERSION;
80
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
81
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
82
+ * attribs.message.ascii = name;
83
+ * hSync = nvtxDomainSyncUserCreate(d, &attribs);
84
+ * }
85
+ *
86
+ * ~MyMutex() {
87
+ * nvtxDomainSyncUserDestroy(hSync);
88
+ * }
89
+ *
90
+ * bool Lock() {
91
+ * nvtxDomainSyncUserAcquireStart(hSync);
92
+ * bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
93
+
94
+ * if (acquired) {
95
+ * nvtxDomainSyncUserAcquireSuccess(hSync);
96
+ * }
97
+ * else {
98
+ * nvtxDomainSyncUserAcquireFailed(hSync);
99
+ * }
100
+ * return acquired;
101
+ * }
102
+
103
+ * void Unlock() {
104
+ * nvtxDomainSyncUserReleasing(hSync);
105
+ * bLocked = false;
106
+ * }
107
+ * };
108
+ * \endcode
109
+ *
110
+ * \version \NVTX_VERSION_2
111
+ */
112
+
113
+ /* ------------------------------------------------------------------------- */
114
+ /* \cond SHOW_HIDDEN
115
+ * \brief Used to build a non-colliding value for resource types separated class
116
+ * \version \NVTX_VERSION_2
117
+ */
118
+ #define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */
119
+ #define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/
120
+ /** \endcond */
121
+
122
+
123
+ /* ------------------------------------------------------------------------- */
124
+ /** \defgroup SYNCHRONIZATION Synchronization
125
+ * See page \ref PAGE_SYNCHRONIZATION.
126
+ * @{
127
+ */
128
+
129
+ /** \brief Resource type values for OSs with POSIX Thread API support
130
+ */
131
+ typedef enum nvtxResourceSyncPosixThreadType_t
132
+ {
133
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */
134
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */
135
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */
136
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */
137
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */
138
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */
139
+ } nvtxResourceSyncPosixThreadType_t;
140
+
141
+ /** \brief Resource type values for Windows OSs
142
+ */
143
+ typedef enum nvtxResourceSyncWindowsType_t
144
+ {
145
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
146
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
147
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
148
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
149
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
150
+ } nvtxResourceSyncWindowsType_t;
151
+
152
+ /** \brief Resource type values for Linux and Linux derived OSs such as Android
153
+ * \sa
154
+ * ::nvtxResourceSyncPosixThreadType_t
155
+ */
156
+ typedef enum nvtxResourceSyncLinuxType_t
157
+ {
158
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
159
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
160
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
161
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
162
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
163
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
164
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
165
+ } nvtxResourceSyncLinuxType_t;
166
+
167
+ /** \brief Resource type values for Android come from Linux.
168
+ * \sa
169
+ * ::nvtxResourceSyncLinuxType_t
170
+ * ::nvtxResourceSyncPosixThreadType_t
171
+ */
172
+ typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
173
+
174
+ /** \brief User Defined Synchronization Object Handle .
175
+ * \anchor SYNCUSER_HANDLE_STRUCTURE
176
+ *
177
+ * This structure is opaque to the user and is used as a handle to reference
178
+ * a user defined syncrhonization object. The tools will return a pointer through the API for the application
179
+ * to hold on it's behalf to reference the string in the future.
180
+ *
181
+ */
182
+ typedef struct nvtxSyncUser* nvtxSyncUser_t;
183
+
184
+ /** \brief User Defined Synchronization Object Attributes Structure.
185
+ * \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
186
+ *
187
+ * This structure is used to describe the attributes of a user defined synchronization
188
+ * object. The layout of the structure is defined by a specific version of the tools
189
+ * extension library and can change between different versions of the Tools Extension
190
+ * library.
191
+ *
192
+ * \par Initializing the Attributes
193
+ *
194
+ * The caller should always perform the following three tasks when using
195
+ * attributes:
196
+ * <ul>
197
+ * <li>Zero the structure
198
+ * <li>Set the version field
199
+ * <li>Set the size field
200
+ * </ul>
201
+ *
202
+ * Zeroing the structure sets all the event attributes types and values
203
+ * to the default value.
204
+ *
205
+ * The version and size field are used by the Tools Extension
206
+ * implementation to handle multiple versions of the attributes structure.
207
+ *
208
+ * It is recommended that the caller use one of the following to methods
209
+ * to initialize the event attributes structure:
210
+ *
211
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
212
+ * \code
213
+ * nvtxSyncUserAttributes_t attribs = {0};
214
+ * attribs.version = NVTX_VERSION;
215
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
216
+ * \endcode
217
+ *
218
+ * \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
219
+ * \code
220
+ * nvtxSyncUserAttributes_t attribs = {0};
221
+ * attribs.version = 1;
222
+ * attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
223
+ * \endcode
224
+ *
225
+ * If the caller uses Method 1 it is critical that the entire binary
226
+ * layout of the structure be configured to 0 so that all fields
227
+ * are initialized to the default value.
228
+ *
229
+ * The caller should either use both NVTX_VERSION and
230
+ * NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
231
+ * and a versioned type (Method 2). Using a mix of the two methods
232
+ * will likely cause either source level incompatibility or binary
233
+ * incompatibility in the future.
234
+ *
235
+ * \par Settings Attribute Types and Values
236
+ *
237
+ *
238
+ * \par Example:
239
+ * \code
240
+ * // Initialize
241
+ * nvtxSyncUserAttributes_t attribs = {0};
242
+ * attribs.version = NVTX_VERSION;
243
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
244
+ *
245
+ * // Configure the Attributes
246
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
247
+ * attribs.message.ascii = "Example";
248
+ * \endcode
249
+ *
250
+ * \sa
251
+ * ::nvtxDomainSyncUserCreate
252
+ */
253
+ typedef struct nvtxSyncUserAttributes_v0
254
+ {
255
+ /**
256
+ * \brief Version flag of the structure.
257
+ *
258
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
259
+ * supported in this header file. This can optionally be overridden to
260
+ * another version of the tools extension library.
261
+ */
262
+ uint16_t version;
263
+
264
+ /**
265
+ * \brief Size of the structure.
266
+ *
267
+ * Needs to be set to the size in bytes of the event attribute
268
+ * structure used to specify the event.
269
+ */
270
+ uint16_t size;
271
+
272
+ /** \brief Message type specified in this attribute structure.
273
+ *
274
+ * Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
275
+ * "message" field.
276
+ *
277
+ * Default Value is NVTX_MESSAGE_UNKNOWN
278
+ */
279
+ int32_t messageType; /* nvtxMessageType_t */
280
+
281
+ /** \brief Message assigned to this attribute structure.
282
+ *
283
+ * The text message that is attached to an event.
284
+ */
285
+ nvtxMessageValue_t message;
286
+
287
+ } nvtxSyncUserAttributes_v0;
288
+
289
+ typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
290
+
291
+ /* ------------------------------------------------------------------------- */
292
+ /** \brief Create a user defined synchronization object
293
+ * This is used to track non-OS synchronization working with spinlocks and atomics
294
+ *
295
+ * \param domain - Domain to own the resource
296
+ * \param attribs - A structure to assign multiple attributes to the object.
297
+ *
298
+ * \return A handle that represents the newly created user defined synchronization object.
299
+ *
300
+ * \sa
301
+ * ::nvtxDomainSyncUserCreate
302
+ * ::nvtxDomainSyncUserDestroy
303
+ * ::nvtxDomainSyncUserAcquireStart
304
+ * ::nvtxDomainSyncUserAcquireFailed
305
+ * ::nvtxDomainSyncUserAcquireSuccess
306
+ * ::nvtxDomainSyncUserReleasing
307
+ *
308
+ * \version \NVTX_VERSION_2
309
+ */
310
+ NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
311
+
312
+ /* ------------------------------------------------------------------------- */
313
+ /** \brief Destroy a user defined synchronization object
314
+ * This is used to track non-OS synchronization working with spinlocks and atomics
315
+ *
316
+ * \param handle - A handle to the object to operate on.
317
+ *
318
+ * \sa
319
+ * ::nvtxDomainSyncUserCreate
320
+ * ::nvtxDomainSyncUserDestroy
321
+ * ::nvtxDomainSyncUserAcquireStart
322
+ * ::nvtxDomainSyncUserAcquireFailed
323
+ * ::nvtxDomainSyncUserAcquireSuccess
324
+ * ::nvtxDomainSyncUserReleasing
325
+ *
326
+ * \version \NVTX_VERSION_2
327
+ */
328
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
329
+
330
+ /* ------------------------------------------------------------------------- */
331
+ /** \brief Signal to tools that an attempt to acquire a user defined synchronization object
332
+ *
333
+ * \param handle - A handle to the object to operate on.
334
+ *
335
+ * \sa
336
+ * ::nvtxDomainSyncUserCreate
337
+ * ::nvtxDomainSyncUserDestroy
338
+ * ::nvtxDomainSyncUserAcquireStart
339
+ * ::nvtxDomainSyncUserAcquireFailed
340
+ * ::nvtxDomainSyncUserAcquireSuccess
341
+ * ::nvtxDomainSyncUserReleasing
342
+ *
343
+ * \version \NVTX_VERSION_2
344
+ */
345
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
346
+
347
+ /* ------------------------------------------------------------------------- */
348
+ /** \brief Signal to tools of failure in acquiring a user defined synchronization object
349
+ * This should be called after \ref nvtxDomainSyncUserAcquireStart
350
+ *
351
+ * \param handle - A handle to the object to operate on.
352
+ *
353
+ * \sa
354
+ * ::nvtxDomainSyncUserCreate
355
+ * ::nvtxDomainSyncUserDestroy
356
+ * ::nvtxDomainSyncUserAcquireStart
357
+ * ::nvtxDomainSyncUserAcquireFailed
358
+ * ::nvtxDomainSyncUserAcquireSuccess
359
+ * ::nvtxDomainSyncUserReleasing
360
+ *
361
+ * \version \NVTX_VERSION_2
362
+ */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle);
363
+
364
+ /* ------------------------------------------------------------------------- */
365
+ /** \brief Signal to tools of success in acquiring a user defined synchronization object
366
+ * This should be called after \ref nvtxDomainSyncUserAcquireStart.
367
+ *
368
+ * \param handle - A handle to the object to operate on.
369
+ *
370
+ * \sa
371
+ * ::nvtxDomainSyncUserCreate
372
+ * ::nvtxDomainSyncUserDestroy
373
+ * ::nvtxDomainSyncUserAcquireStart
374
+ * ::nvtxDomainSyncUserAcquireFailed
375
+ * ::nvtxDomainSyncUserAcquireSuccess
376
+ * ::nvtxDomainSyncUserReleasing
377
+ *
378
+ * \version \NVTX_VERSION_2
379
+ */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle);
380
+
381
+ /* ------------------------------------------------------------------------- */
382
+ /** \brief Signal to tools of releasing a reservation on user defined synchronization object
383
+ * This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
384
+ *
385
+ * \param handle - A handle to the object to operate on.
386
+ *
387
+ * \sa
388
+ * ::nvtxDomainSyncUserCreate
389
+ * ::nvtxDomainSyncUserDestroy
390
+ * ::nvtxDomainSyncUserAcquireStart
391
+ * ::nvtxDomainSyncUserAcquireFailed
392
+ * ::nvtxDomainSyncUserAcquireSuccess
393
+ * ::nvtxDomainSyncUserReleasing
394
+ *
395
+ * \version \NVTX_VERSION_2
396
+ */
397
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
398
+
399
+
400
+ /** @} */ /*END defgroup*/
401
+
402
+ #ifdef __cplusplus
403
+ }
404
+ #endif /* __cplusplus */
405
+
406
+ #endif /* NVTOOLSEXT_SYNC_H_ */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExt.h ADDED
@@ -0,0 +1,1499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ /** \file nvToolsExt.h
39
+ */
40
+
41
+ /* ========================================================================= */
42
+ /** \mainpage
43
+ * \tableofcontents
44
+ * \section INTRODUCTION Introduction
45
+ *
46
+ * The NVIDIA Tools Extension library is a set of functions that a
47
+ * developer can use to provide additional information to tools.
48
+ * The additional information is used by the tool to improve
49
+ * analysis and visualization of data.
50
+ *
51
+ * The library introduces close to zero overhead if no tool is
52
+ * attached to the application. The overhead when a tool is
53
+ * attached is specific to the tool.
54
+ *
55
+ * \section INITIALIZATION_SECTION Initialization
56
+ *
57
+ * Typically the tool's library that plugs into NVTX is indirectly
58
+ * loaded via environmental properties that are platform specific.
59
+ * For some platform or special cases, the user may be required
60
+ * to explicitly initialize it instead. This can also
61
+ * be helpful to control when the API loads a tool's library instead
62
+ * of what would typically be the first function call to emit info.
63
+ * For these rare case, see \ref INITIALIZATION for additional information.
64
+ *
65
+ * \section MARKERS_AND_RANGES Markers and Ranges
66
+ *
67
+ * Markers and ranges are used to describe events at a specific time (markers)
68
+ * or over a time span (ranges) during the execution of the application
69
+ * respectively.
70
+ *
71
+ * \subsection MARKERS Markers
72
+ *
73
+ * Markers denote specific moments in time.
74
+ *
75
+ *
76
+ * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
77
+ * how to specify the domain.
78
+ *
79
+ * \subsection THREAD_RANGES Thread Ranges
80
+ *
81
+ * Thread ranges denote nested time ranges. Nesting is maintained per thread
82
+ * per domain and does not require any additional correlation mechanism. The
83
+ * duration of a thread range is defined by the corresponding pair of
84
+ * nvtxRangePush* to nvtxRangePop API calls.
85
+ *
86
+ * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
87
+ * how to specify the domain.
88
+ *
89
+ * \subsection PROCESS_RANGES Process Ranges
90
+ *
91
+ * Process ranges denote a time span that can expose arbitrary concurrency, as
92
+ * opposed to thread ranges that only support nesting. In addition the range
93
+ * start event can happen on a different thread than the end marker. For the
94
+ * correlation of a start/end pair an unique correlation ID is used that is
95
+ * returned from the start API call and needs to be passed into the end API
96
+ * call.
97
+ *
98
+ * \subsection EVENT_ATTRIBUTES Event Attributes
99
+ *
100
+ * \ref MARKERS_AND_RANGES can be annotated with various attributes to provide
101
+ * additional information for an event or to guide the tool's visualization of
102
+ * the data. Each of the attributes is optional and if left unused the
103
+ * attributes fall back to a default value. The attributes include:
104
+ * - color
105
+ * - category
106
+ *
107
+ * To specify any attribute other than the text message, the \ref
108
+ * EVENT_ATTRIBUTE_STRUCTURE "Event Attribute Structure" must be used.
109
+ *
110
+ * \section DOMAINS Domains
111
+ *
112
+ * Domains enable developers to scope annotations. By default all events and
113
+ * annotations are in the default domain. Additional domains can be registered.
114
+ * This allows developers to scope markers, ranges, and resources names to
115
+ * avoid conflicts.
116
+ *
117
+ * The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create
118
+ * a named domain.
119
+ *
120
+ * Each domain maintains its own
121
+ * - categories
122
+ * - thread range stacks
123
+ * - registered strings
124
+ *
125
+ * The function ::nvtxDomainDestroy marks the end of the domain. Destroying
126
+ * a domain unregisters and destroys all objects associated with it such as
127
+ * registered strings, resource objects, named categories, and started ranges.
128
+ *
129
+ * \section RESOURCE_NAMING Resource Naming
130
+ *
131
+ * This section covers calls that allow to annotate objects with user-provided
132
+ * names in order to allow for a better analysis of complex trace data. All of
133
+ * the functions take the handle or the ID of the object to name and the name.
134
+ * The functions can be called multiple times during the execution of an
135
+ * application, however, in that case it is implementation dependent which
136
+ * name will be reported by the tool.
137
+ *
138
+ * \subsection CATEGORY_NAMING Category Naming
139
+ *
140
+ * Some function in this library support associating an integer category
141
+ * to enable filtering and sorting. The category naming functions allow
142
+ * the application to associate a user friendly name with the integer
143
+ * category. Support for domains have been added in NVTX_VERSION_2 to
144
+ * avoid collisions when domains are developed independently.
145
+ *
146
+ * \subsection RESOURCE_OBJECTS Resource Objects
147
+ *
148
+ * Resource objects are a generic mechanism for attaching data to an application
149
+ * resource. The identifier field makes the association to a pointer or handle,
150
+ * while the type field helps provide deeper understanding of the identifier as
151
+ * well as enabling differentiation in cases where handles generated by different
152
+ * APIs may collide. The resource object may also have an associated message to
153
+ * associate with the application resource, enabling further annotation of this
154
+ * object and how it is used.
155
+ *
156
+ * The resource object was introduced in NVTX_VERSION_2 to supersede existing naming
157
+ * functions and allow the application resource identified by those functions to be
158
+ * associated to a domain. The other naming functions are still supported for backward
159
+ * compatibility but will be associated only to the default domain.
160
+ *
161
+ * \subsection RESOURCE_NAMING_OS Resource Naming
162
+ *
163
+ * Some operating system resources creation APIs do not support providing a user friendly
164
+ * name, such as some OS thread creation APIs. This API supports resource naming
165
+ * both through resource objects and functions following the pattern
166
+ * nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2
167
+ * supersede the other functions with a more general method of assigning names to OS resources,
168
+ * along with associating them to domains too. The older nvtxName* functions are only associated
169
+ * with the default domain.
170
+ * \section EXTENSIONS Optional Extensions
171
+ * Optional extensions will either appear within the existing sections they extend or appear
172
+ * in the "Related Pages" when they introduce new concepts.
173
+ */
174
+
175
+ /**
176
+ * Tools Extension API version
177
+ */
178
+ #if defined(NVTX_VERSION) && NVTX_VERSION < 3
179
+ #error "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included. If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX. Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead."
180
+ #endif
181
+
182
+ /* Header guard */
183
+ #if !defined(NVTX_VERSION)
184
+ #define NVTX_VERSION 3
185
+
186
+ #if defined(_MSC_VER)
187
+ #define NVTX_API __stdcall
188
+ #define NVTX_INLINE_STATIC __inline static
189
+ #else /*defined(__GNUC__)*/
190
+ #define NVTX_API
191
+ #define NVTX_INLINE_STATIC inline static
192
+ #endif /* Platform */
193
+
194
+ #if defined(NVTX_NO_IMPL)
195
+ /* When omitting implementation, avoid declaring functions inline */
196
+ /* without definitions, since this causes compiler warnings. */
197
+ #define NVTX_DECLSPEC
198
+ #elif defined(NVTX_EXPORT_API)
199
+ /* Allow overriding definition of NVTX_DECLSPEC when exporting API. */
200
+ /* Default is empty, meaning non-inline with external linkage. */
201
+ #if !defined(NVTX_DECLSPEC)
202
+ #define NVTX_DECLSPEC
203
+ #endif
204
+ #else
205
+ /* Normal NVTX usage defines the NVTX API inline with static */
206
+ /* (internal) linkage. */
207
+ #define NVTX_DECLSPEC NVTX_INLINE_STATIC
208
+ #endif
209
+
210
+ #include "nvtxDetail/nvtxLinkOnce.h"
211
+
212
+ #define NVTX_VERSIONED_IDENTIFIER_L3(NAME, VERSION) NAME##_v##VERSION
213
+ #define NVTX_VERSIONED_IDENTIFIER_L2(NAME, VERSION) NVTX_VERSIONED_IDENTIFIER_L3(NAME, VERSION)
214
+ #define NVTX_VERSIONED_IDENTIFIER(NAME) NVTX_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION)
215
+
216
+ /**
217
+ * The nvToolsExt library depends on stdint.h. If the build tool chain in use
218
+ * does not include stdint.h then define NVTX_STDINT_TYPES_ALREADY_DEFINED
219
+ * and define the following types:
220
+ * <ul>
221
+ * <li>uint8_t
222
+ * <li>int8_t
223
+ * <li>uint16_t
224
+ * <li>int16_t
225
+ * <li>uint32_t
226
+ * <li>int32_t
227
+ * <li>uint64_t
228
+ * <li>int64_t
229
+ * <li>uintptr_t
230
+ * <li>intptr_t
231
+ * </ul>
232
+ * #define NVTX_STDINT_TYPES_ALREADY_DEFINED if you are using your own header file.
233
+ */
234
+ #ifndef NVTX_STDINT_TYPES_ALREADY_DEFINED
235
+ #include <stdint.h>
236
+ #endif
237
+
238
+ #include <stddef.h>
239
+
240
+ #ifdef __cplusplus
241
+ extern "C" {
242
+ #endif /* __cplusplus */
243
+
244
/**
 * Result Codes
 *
 * Returned by the NVTX loading/initialization machinery; the INIT/LOAD
 * names describe failures while locating and binding a tool's injection
 * library.
 */

#define NVTX_SUCCESS 0
#define NVTX_FAIL 1
#define NVTX_ERR_INIT_LOAD_PROPERTY 2
#define NVTX_ERR_INIT_ACCESS_LIBRARY 3
#define NVTX_ERR_INIT_LOAD_LIBRARY 4
#define NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT 5
#define NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT 6
#define NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE 7

/**
 * Size of the nvtxEventAttributes_t structure.
 */
#define NVTX_EVENT_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxEventAttributes_t) ) )

/* Sentinel depth value (-2); per its name, presumably reported when a tool
 * does not track range push/pop nesting — confirm against the tool's docs. */
#define NVTX_NO_PUSH_POP_TRACKING ((int)-2)
263
+
264
/* Unique ID correlating a Start/End pair of process-range events. */
typedef uint64_t nvtxRangeId_t;

/* Forward declaration of opaque domain registration structure */
struct nvtxDomainRegistration_st;
typedef struct nvtxDomainRegistration_st nvtxDomainRegistration;

/* \brief Domain Handle Structure.
 * \anchor DOMAIN_HANDLE_STRUCTURE
 *
 * Opaque to the user; used as a handle to reference a domain. Returned by
 * the tool from the NVTX domain-creation API calls.
 */
typedef nvtxDomainRegistration* nvtxDomainHandle_t;

/* Forward declaration of opaque string registration structure */
struct nvtxStringRegistration_st;
typedef struct nvtxStringRegistration_st nvtxStringRegistration;

/* \brief Registered String Handle Structure.
 * \anchor REGISTERED_STRING_HANDLE_STRUCTURE
 *
 * Opaque to the user; used as a handle to reference a registered string.
 * Returned by the tool from the NVTX string-registration API calls.
 */
typedef nvtxStringRegistration* nvtxStringHandle_t;
293
+
294
/* ========================================================================= */
/** \defgroup GENERAL General
 * @{
 */

/** ---------------------------------------------------------------------------
 * Color Types
 * ------------------------------------------------------------------------- */
typedef enum nvtxColorType_t
{
    NVTX_COLOR_UNKNOWN = 0, /**< Color attribute is unused. */
    NVTX_COLOR_ARGB    = 1  /**< An ARGB color is provided. */
} nvtxColorType_t;
307
+
308
/** ---------------------------------------------------------------------------
 * Message Types
 * ------------------------------------------------------------------------- */
typedef enum nvtxMessageType_t
{
    NVTX_MESSAGE_UNKNOWN         = 0, /**< Message payload is unused. */
    NVTX_MESSAGE_TYPE_ASCII      = 1, /**< A character sequence is used as payload. */
    NVTX_MESSAGE_TYPE_UNICODE    = 2, /**< A wide character sequence is used as payload. */
    /* NVTX_VERSION_2 */
    NVTX_MESSAGE_TYPE_REGISTERED = 3, /**< A unique string handle that was registered
                                           with \ref nvtxDomainRegisterStringA() or
                                           \ref nvtxDomainRegisterStringW(). */
} nvtxMessageType_t;
321
+
322
+ typedef union nvtxMessageValue_t
323
+ {
324
+ const char* ascii;
325
+ const wchar_t* unicode;
326
+ /* NVTX_VERSION_2 */
327
+ nvtxStringHandle_t registered;
328
+ } nvtxMessageValue_t;
329
+
330
+
331
+ /** @} */ /*END defgroup*/
332
+ /* ------------------------------------------------------------------------- */
333
+ /** \brief Force initialization (optional)
334
+ *
335
+ * Force NVTX library to initialize. The first call to any NVTX API function
336
+ * will automatically initialize the entire API. This can make the first call
337
+ * much slower than subsequent calls. In applications where the first call to
338
+ * NVTX may be in a performance-critical section, calling nvtxInitialize before
339
+ * any performance-critical sections will ensure NVTX initialization occurs at
340
+ * an acceptable time. Since nvtxInitialize takes no parameters and has no
341
+ * expected behavior besides initialization, it is convenient to add a call to
342
+ * nvtxInitialize in NVTX-instrumented applications that need to force earlier
343
+ * initialization without changing any other code. For example, if an app's
344
+ * first NVTX call is nvtxDomainCreate, and it is difficult to move that call
345
+ * earlier because the domain handle must be stored in an object only created
346
+ * at that point, adding a call to nvtxInitialize at the top of main() will
347
+ * ensure the later call to nvtxDomainCreate is as fast as possible.
348
+ *
349
+ * \version \NVTX_VERSION_3
350
+ *
351
+ * \param reserved - must be zero or NULL.
352
+ *
353
+ * @{ */
354
+ NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved);
355
+ /** @} */
356
+
357
+
358
+ /** @} */ /*END defgroup*/
359
+
360
/* ========================================================================= */
/** \defgroup EVENT_ATTRIBUTES Event Attributes
 * @{
 */

/** ---------------------------------------------------------------------------
 * Payload Types
 *
 * Selects which member of the payload union in nvtxEventAttributes_t holds
 * the event's numeric payload.
 * ------------------------------------------------------------------------- */
typedef enum nvtxPayloadType_t
{
    NVTX_PAYLOAD_UNKNOWN             = 0, /**< Payload attribute is unused. */
    NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1, /**< A 64 bit unsigned integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_INT64          = 2, /**< A 64 bit signed integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_DOUBLE         = 3, /**< A 64 bit floating point value is used as payload. */
    /* NVTX_VERSION_2 */
    NVTX_PAYLOAD_TYPE_UNSIGNED_INT32 = 4, /**< A 32 bit unsigned integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_INT32          = 5, /**< A 32 bit signed integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_FLOAT          = 6  /**< A 32 bit floating point value is used as payload. */
} nvtxPayloadType_t;
379
+
380
+ /** \brief Event Attribute Structure.
381
+ * \anchor EVENT_ATTRIBUTE_STRUCTURE
382
+ *
383
+ * This structure is used to describe the attributes of an event. The layout of
384
+ * the structure is defined by a specific version of the tools extension
385
+ * library and can change between different versions of the Tools Extension
386
+ * library.
387
+ *
388
+ * \par Initializing the Attributes
389
+ *
390
+ * The caller should always perform the following three tasks when using
391
+ * attributes:
392
+ * <ul>
393
+ * <li>Zero the structure
394
+ * <li>Set the version field
395
+ * <li>Set the size field
396
+ * </ul>
397
+ *
398
+ * Zeroing the structure sets all the event attributes types and values
399
+ * to the default value.
400
+ *
401
+ * The version and size field are used by the Tools Extension
402
+ * implementation to handle multiple versions of the attributes structure.
403
+ *
404
+ * It is recommended that the caller use one of the following to methods
405
+ * to initialize the event attributes structure:
406
+ *
407
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
408
+ * \code
409
+ * nvtxEventAttributes_t eventAttrib = {0};
410
+ * eventAttrib.version = NVTX_VERSION;
411
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
412
+ * \endcode
413
+ *
414
+ * \par Method 2: Initializing nvtxEventAttributes for a specific version
415
+ * \code
416
+ * nvtxEventAttributes_t eventAttrib = {0};
417
+ * eventAttrib.version = 1;
418
+ * eventAttrib.size = (uint16_t)(sizeof(nvtxEventAttributes_v1));
419
+ * \endcode
420
+ *
421
+ * If the caller uses Method 1 it is critical that the entire binary
422
+ * layout of the structure be configured to 0 so that all fields
423
+ * are initialized to the default value.
424
+ *
425
+ * The caller should either use both NVTX_VERSION and
426
+ * NVTX_EVENT_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
427
+ * and a versioned type (Method 2). Using a mix of the two methods
428
+ * will likely cause either source level incompatibility or binary
429
+ * incompatibility in the future.
430
+ *
431
+ * \par Settings Attribute Types and Values
432
+ *
433
+ *
434
+ * \par Example:
435
+ * \code
436
+ * // Initialize
437
+ * nvtxEventAttributes_t eventAttrib = {0};
438
+ * eventAttrib.version = NVTX_VERSION;
439
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
440
+ *
441
+ * // Configure the Attributes
442
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
443
+ * eventAttrib.color = 0xFF880000;
444
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
445
+ * eventAttrib.message.ascii = "Example";
446
+ * \endcode
447
+ *
448
+ * In the example the caller does not have to set the value of
449
+ * \ref ::nvtxEventAttributes_v2::category or
450
+ * \ref ::nvtxEventAttributes_v2::payload as these fields were set to
451
+ * the default value by {0}.
452
+ * \sa
453
+ * ::nvtxDomainMarkEx
454
+ * ::nvtxDomainRangeStartEx
455
+ * ::nvtxDomainRangePushEx
456
+ */
457
+ typedef struct nvtxEventAttributes_v2
458
+ {
459
+ /**
460
+ * \brief Version flag of the structure.
461
+ *
462
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
463
+ * supported in this header file. This can optionally be overridden to
464
+ * another version of the tools extension library.
465
+ */
466
+ uint16_t version;
467
+
468
+ /**
469
+ * \brief Size of the structure.
470
+ *
471
+ * Needs to be set to the size in bytes of the event attribute
472
+ * structure used to specify the event.
473
+ */
474
+ uint16_t size;
475
+
476
+ /**
477
+ * \brief ID of the category the event is assigned to.
478
+ *
479
+ * A category is a user-controlled ID that can be used to group
480
+ * events. The tool may use category IDs to improve filtering or
481
+ * enable grouping of events in the same category. The functions
482
+ * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used
483
+ * to name a category.
484
+ *
485
+ * Default Value is 0
486
+ */
487
+ uint32_t category;
488
+
489
+ /** \brief Color type specified in this attribute structure.
490
+ *
491
+ * Defines the color format of the attribute structure's \ref COLOR_FIELD
492
+ * "color" field.
493
+ *
494
+ * Default Value is NVTX_COLOR_UNKNOWN
495
+ */
496
+ int32_t colorType; /* nvtxColorType_t */
497
+
498
+ /** \brief Color assigned to this event. \anchor COLOR_FIELD
499
+ *
500
+ * The color that the tool should use to visualize the event.
501
+ */
502
+ uint32_t color;
503
+
504
+ /**
505
+ * \brief Payload type specified in this attribute structure.
506
+ *
507
+ * Defines the payload format of the attribute structure's \ref PAYLOAD_FIELD
508
+ * "payload" field.
509
+ *
510
+ * Default Value is NVTX_PAYLOAD_UNKNOWN
511
+ */
512
+ int32_t payloadType; /* nvtxPayloadType_t */
513
+
514
+ int32_t reserved0;
515
+
516
+ /**
517
+ * \brief Payload assigned to this event. \anchor PAYLOAD_FIELD
518
+ *
519
+ * A numerical value that can be used to annotate an event. The tool could
520
+ * use the payload data to reconstruct graphs and diagrams.
521
+ */
522
+ union payload_t
523
+ {
524
+ uint64_t ullValue;
525
+ int64_t llValue;
526
+ double dValue;
527
+ /* NVTX_VERSION_2 */
528
+ uint32_t uiValue;
529
+ int32_t iValue;
530
+ float fValue;
531
+ } payload;
532
+
533
+ /** \brief Message type specified in this attribute structure.
534
+ *
535
+ * Defines the message format of the attribute structure's \ref MESSAGE_FIELD
536
+ * "message" field.
537
+ *
538
+ * Default Value is NVTX_MESSAGE_UNKNOWN
539
+ */
540
+ int32_t messageType; /* nvtxMessageType_t */
541
+
542
+ /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD
543
+ *
544
+ * The text message that is attached to an event.
545
+ */
546
+ nvtxMessageValue_t message;
547
+
548
+ } nvtxEventAttributes_v2;
549
+
550
+ typedef struct nvtxEventAttributes_v2 nvtxEventAttributes_t;
551
+
552
+ /** @} */ /*END defgroup*/
553
+ /* ========================================================================= */
554
+ /** \defgroup MARKERS_AND_RANGES Markers and Ranges
555
+ *
556
+ * See \ref MARKERS_AND_RANGES for more details
557
+ *
558
+ * @{
559
+ */
560
+
561
+ /** \name Marker */
562
+
563
+ /* ------------------------------------------------------------------------- */
564
+ /** \brief Marks an instantaneous event in the application.
565
+ *
566
+ * A marker can contain a text message or specify additional information
567
+ * using the event attributes structure. These attributes include a text
568
+ * message, color, category, and a payload. Each of the attributes is optional
569
+ * and can only be sent out using the \ref nvtxDomainMarkEx function.
570
+ *
571
+ * nvtxDomainMarkEx(NULL, event) is equivalent to calling
572
+ * nvtxMarkEx(event).
573
+ *
574
+ * \param domain - The domain of scoping the category.
575
+ * \param eventAttrib - The event attribute structure defining the marker's
576
+ * attribute types and attribute values.
577
+ *
578
+ * \sa
579
+ * ::nvtxMarkEx
580
+ *
581
+ * \version \NVTX_VERSION_2
582
+ * @{ */
583
+ NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
584
+ /** @} */
585
+
586
+ /* ------------------------------------------------------------------------- */
587
+ /** \brief Marks an instantaneous event in the application.
588
+ *
589
+ * A marker can contain a text message or specify additional information
590
+ * using the event attributes structure. These attributes include a text
591
+ * message, color, category, and a payload. Each of the attributes is optional
592
+ * and can only be sent out using the \ref nvtxMarkEx function.
593
+ * If \ref nvtxMarkA or \ref nvtxMarkW are used to specify the marker
594
+ * or if an attribute is unspecified then a default value will be used.
595
+ *
596
+ * \param eventAttrib - The event attribute structure defining the marker's
597
+ * attribute types and attribute values.
598
+ *
599
+ * \par Example:
600
+ * \code
601
+ * // zero the structure
602
+ * nvtxEventAttributes_t eventAttrib = {0};
603
+ * // set the version and the size information
604
+ * eventAttrib.version = NVTX_VERSION;
605
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
606
+ * // configure the attributes. 0 is the default for all attributes.
607
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
608
+ * eventAttrib.color = 0xFF880000;
609
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
610
+ * eventAttrib.message.ascii = "Example nvtxMarkEx";
611
+ * nvtxMarkEx(&eventAttrib);
612
+ * \endcode
613
+ *
614
+ * \sa
615
+ * ::nvtxDomainMarkEx
616
+ *
617
+ * \version \NVTX_VERSION_1
618
+ * @{ */
619
+ NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib);
620
+ /** @} */
621
+
622
+ /* ------------------------------------------------------------------------- */
623
+ /** \brief Marks an instantaneous event in the application.
624
+ *
625
+ * A marker created using \ref nvtxMarkA or \ref nvtxMarkW contains only a
626
+ * text message.
627
+ *
628
+ * \param message - The message associated to this marker event.
629
+ *
630
+ * \par Example:
631
+ * \code
632
+ * nvtxMarkA("Example nvtxMarkA");
633
+ * nvtxMarkW(L"Example nvtxMarkW");
634
+ * \endcode
635
+ *
636
+ * \sa
637
+ * ::nvtxDomainMarkEx
638
+ * ::nvtxMarkEx
639
+ *
640
+ * \version \NVTX_VERSION_0
641
+ * @{ */
642
+ NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message);
643
+ NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message);
644
+ /** @} */
645
+
646
+
647
+ /** \name Process Ranges */
648
+
649
+ /* ------------------------------------------------------------------------- */
650
+ /** \brief Starts a process range in a domain.
651
+ *
652
+ * \param domain - The domain of scoping the category.
653
+ * \param eventAttrib - The event attribute structure defining the range's
654
+ * attribute types and attribute values.
655
+ *
656
+ * \return The unique ID used to correlate a pair of Start and End events.
657
+ *
658
+ * \remarks Ranges defined by Start/End can overlap.
659
+ *
660
+ * \par Example:
661
+ * \code
662
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain");
663
+ * nvtxEventAttributes_t eventAttrib = {0};
664
+ * eventAttrib.version = NVTX_VERSION;
665
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
666
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
667
+ * eventAttrib.message.ascii = "my range";
668
+ * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib);
669
+ * // ...
670
+ * nvtxDomainRangeEnd(rangeId);
671
+ * \endcode
672
+ *
673
+ * \sa
674
+ * ::nvtxDomainRangeEnd
675
+ *
676
+ * \version \NVTX_VERSION_2
677
+ * @{ */
678
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
679
+ /** @} */
680
+
681
+ /* ------------------------------------------------------------------------- */
682
+ /** \brief Starts a process range.
683
+ *
684
+ * \param eventAttrib - The event attribute structure defining the range's
685
+ * attribute types and attribute values.
686
+ *
687
+ * \return The unique ID used to correlate a pair of Start and End events.
688
+ *
689
+ * \remarks Ranges defined by Start/End can overlap.
690
+ *
691
+ * \par Example:
692
+ * \code
693
+ * nvtxEventAttributes_t eventAttrib = {0};
694
+ * eventAttrib.version = NVTX_VERSION;
695
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
696
+ * eventAttrib.category = 3;
697
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
698
+ * eventAttrib.color = 0xFF0088FF;
699
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
700
+ * eventAttrib.message.ascii = "Example Range";
701
+ * nvtxRangeId_t rangeId = nvtxRangeStartEx(&eventAttrib);
702
+ * // ...
703
+ * nvtxRangeEnd(rangeId);
704
+ * \endcode
705
+ *
706
+ * \sa
707
+ * ::nvtxRangeEnd
708
+ * ::nvtxDomainRangeStartEx
709
+ *
710
+ * \version \NVTX_VERSION_1
711
+ * @{ */
712
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib);
713
+ /** @} */
714
+
715
+ /* ------------------------------------------------------------------------- */
716
+ /** \brief Starts a process range.
717
+ *
718
+ * \param message - The event message associated to this range event.
719
+ *
720
+ * \return The unique ID used to correlate a pair of Start and End events.
721
+ *
722
+ * \remarks Ranges defined by Start/End can overlap.
723
+ *
724
+ * \par Example:
725
+ * \code
726
+ * nvtxRangeId_t r1 = nvtxRangeStartA("Range 1");
727
+ * nvtxRangeId_t r2 = nvtxRangeStartW(L"Range 2");
728
+ * nvtxRangeEnd(r1);
729
+ * nvtxRangeEnd(r2);
730
+ * \endcode
731
+ *
732
+ * \sa
733
+ * ::nvtxRangeEnd
734
+ * ::nvtxRangeStartEx
735
+ * ::nvtxDomainRangeStartEx
736
+ *
737
+ * \version \NVTX_VERSION_0
738
+ * @{ */
739
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message);
740
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message);
741
+ /** @} */
742
+
743
+ /* ------------------------------------------------------------------------- */
744
+ /** \brief Ends a process range.
745
+ *
746
+ * \param domain - The domain
747
+ * \param id - The correlation ID returned from a nvtxRangeStart call.
748
+ *
749
+ * \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd.
750
+ * It does not need a domain param since that is associated iwth the range ID at ::nvtxDomainRangeStartEx
751
+ *
752
+ * \par Example:
753
+ * \code
754
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain");
755
+ * nvtxEventAttributes_t eventAttrib = {0};
756
+ * eventAttrib.version = NVTX_VERSION;
757
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
758
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
759
+ * eventAttrib.message.ascii = "my range";
760
+ * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib);
761
+ * // ...
762
+ * nvtxDomainRangeEnd(rangeId);
763
+ * \endcode
764
+ *
765
+ * \sa
766
+ * ::nvtxDomainRangeStartEx
767
+ *
768
+ * \version \NVTX_VERSION_2
769
+ * @{ */
770
+ NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id);
771
+ /** @} */
772
+
773
+ /* ------------------------------------------------------------------------- */
774
+ /** \brief Ends a process range.
775
+ *
776
+ * \param id - The correlation ID returned from an nvtxRangeStart call.
777
+ *
778
+ * \sa
779
+ * ::nvtxDomainRangeStartEx
780
+ * ::nvtxRangeStartEx
781
+ * ::nvtxRangeStartA
782
+ * ::nvtxRangeStartW
783
+ *
784
+ * \version \NVTX_VERSION_0
785
+ * @{ */
786
+ NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id);
787
+ /** @} */
788
+
789
+ /** \name Thread Ranges */
790
+
791
+ /* ------------------------------------------------------------------------- */
792
+ /** \brief Starts a nested thread range.
793
+ *
794
+ * \param domain - The domain of scoping.
795
+ * \param eventAttrib - The event attribute structure defining the range's
796
+ * attribute types and attribute values.
797
+ *
798
+ * \return The 0 based level of range being started. This value is scoped to the domain.
799
+ * If an error occurs, a negative value is returned.
800
+ *
801
+ * \par Example:
802
+ * \code
803
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
804
+ * nvtxEventAttributes_t eventAttrib = {0};
805
+ * eventAttrib.version = NVTX_VERSION;
806
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
807
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
808
+ * eventAttrib.color = 0xFFFF0000;
809
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
810
+ * eventAttrib.message.ascii = "Level 0";
811
+ * nvtxDomainRangePushEx(domain, &eventAttrib);
812
+ *
813
+ * // Re-use eventAttrib
814
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
815
+ * eventAttrib.message.unicode = L"Level 1";
816
+ * nvtxDomainRangePushEx(domain, &eventAttrib);
817
+ *
818
+ * nvtxDomainRangePop(domain); //level 1
819
+ * nvtxDomainRangePop(domain); //level 0
820
+ * \endcode
821
+ *
822
+ * \sa
823
+ * ::nvtxDomainRangePop
824
+ *
825
+ * \version \NVTX_VERSION_2
826
+ * @{ */
827
+ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
828
+ /** @} */
829
+
830
+ /* ------------------------------------------------------------------------- */
831
+ /** \brief Starts a nested thread range.
832
+ *
833
+ * \param eventAttrib - The event attribute structure defining the range's
834
+ * attribute types and attribute values.
835
+ *
836
+ * \return The 0 based level of range being started. This level is per domain.
837
+ * If an error occurs a negative value is returned.
838
+ *
839
+ * \par Example:
840
+ * \code
841
+ * nvtxEventAttributes_t eventAttrib = {0};
842
+ * eventAttrib.version = NVTX_VERSION;
843
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
844
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
845
+ * eventAttrib.color = 0xFFFF0000;
846
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
847
+ * eventAttrib.message.ascii = "Level 0";
848
+ * nvtxRangePushEx(&eventAttrib);
849
+ *
850
+ * // Re-use eventAttrib
851
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
852
+ * eventAttrib.message.unicode = L"Level 1";
853
+ * nvtxRangePushEx(&eventAttrib);
854
+ *
855
+ * nvtxRangePop();
856
+ * nvtxRangePop();
857
+ * \endcode
858
+ *
859
+ * \sa
860
+ * ::nvtxDomainRangePushEx
861
+ * ::nvtxRangePop
862
+ *
863
+ * \version \NVTX_VERSION_1
864
+ * @{ */
865
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib);
866
+ /** @} */
867
+
868
+ /* ------------------------------------------------------------------------- */
869
+ /** \brief Starts a nested thread range.
870
+ *
871
+ * \param message - The event message associated to this range event.
872
+ *
873
+ * \return The 0 based level of range being started. If an error occurs a
874
+ * negative value is returned.
875
+ *
876
+ * \par Example:
877
+ * \code
878
+ * nvtxRangePushA("Level 0");
879
+ * nvtxRangePushW(L"Level 1");
880
+ * nvtxRangePop();
881
+ * nvtxRangePop();
882
+ * \endcode
883
+ *
884
+ * \sa
885
+ * ::nvtxDomainRangePushEx
886
+ * ::nvtxRangePop
887
+ *
888
+ * \version \NVTX_VERSION_0
889
+ * @{ */
890
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message);
891
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message);
892
+ /** @} */
893
+
894
+
895
+ /* ------------------------------------------------------------------------- */
896
+ /** \brief Ends a nested thread range.
897
+ *
898
+ * \return The level of the range being ended. If an error occurs a negative
899
+ * value is returned on the current thread.
900
+ *
901
+ * \par Example:
902
+ * \code
903
+ * nvtxDomainHandle_t domain = nvtxDomainCreate("example library");
904
+ * nvtxDomainRangePushA(domain, "Level 0");
905
+ * nvtxDomainRangePushW(domain, L"Level 1");
906
+ * nvtxDomainRangePop(domain);
907
+ * nvtxDomainRangePop(domain);
908
+ * \endcode
909
+ *
910
+ * \sa
911
+ * ::nvtxRangePushEx
912
+ * ::nvtxRangePushA
913
+ * ::nvtxRangePushW
914
+ *
915
+ * \version \NVTX_VERSION_2
916
+ * @{ */
917
+ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain);
918
+ /** @} */
919
+
920
+ /* ------------------------------------------------------------------------- */
921
+ /** \brief Ends a nested thread range.
922
+ *
923
+ * \return The level of the range being ended. If an error occurs a negative
924
+ * value is returned on the current thread.
925
+ *
926
+ * \par Example:
927
+ * \code
928
+ * nvtxRangePushA("Level 0");
929
+ * nvtxRangePushW(L"Level 1");
930
+ * nvtxRangePop();
931
+ * nvtxRangePop();
932
+ * \endcode
933
+ *
934
+ * \sa
935
+ * ::nvtxRangePushEx
936
+ * ::nvtxRangePushA
937
+ * ::nvtxRangePushW
938
+ *
939
+ * \version \NVTX_VERSION_0
940
+ * @{ */
941
+ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void);
942
+ /** @} */
943
+
944
+
945
+ /** @} */ /*END defgroup*/
946
+ /* ========================================================================= */
947
+ /** \defgroup RESOURCE_NAMING Resource Naming
948
+ *
949
+ * See \ref RESOURCE_NAMING for more details
950
+ *
951
+ * @{
952
+ */
953
+
954
+
955
+ /* ------------------------------------------------------------------------- */
956
+ /** \name Functions for Generic Resource Naming*/
957
+ /* ------------------------------------------------------------------------- */
958
+
959
+ /* ------------------------------------------------------------------------- */
960
+ /** \cond SHOW_HIDDEN
961
+ * \brief Resource typing helpers.
962
+ *
963
+ * Classes are used to make it easy to create a series of resource types
964
+ * per API without collisions
965
+ */
966
+ #define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX)))
967
+ #define NVTX_RESOURCE_CLASS_GENERIC 1
968
+ /** \endcond */
969
+
970
+ /* ------------------------------------------------------------------------- */
971
+ /** \brief Generic resource type for when a resource class is not available.
972
+ *
973
+ * \sa
974
+ * ::nvtxDomainResourceCreate
975
+ *
976
+ * \version \NVTX_VERSION_2
977
+ */
978
+ typedef enum nvtxResourceGenericType_t
979
+ {
980
+ NVTX_RESOURCE_TYPE_UNKNOWN = 0,
981
+ NVTX_RESOURCE_TYPE_GENERIC_POINTER = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 1), /**< Generic pointer assumed to have no collisions with other pointers. */
982
+ NVTX_RESOURCE_TYPE_GENERIC_HANDLE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 2), /**< Generic handle assumed to have no collisions with other handles. */
983
+ NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 3), /**< OS native thread identifier. */
984
+ NVTX_RESOURCE_TYPE_GENERIC_THREAD_POSIX = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 4) /**< POSIX pthread identifier. */
985
+ } nvtxResourceGenericType_t;
986
+
987
+
988
+
989
+ /** \brief Resource Attribute Structure.
990
+ * \anchor RESOURCE_ATTRIBUTE_STRUCTURE
991
+ *
992
+ * This structure is used to describe the attributes of a resource. The layout of
993
+ * the structure is defined by a specific version of the tools extension
994
+ * library and can change between different versions of the Tools Extension
995
+ * library.
996
+ *
997
+ * \par Initializing the Attributes
998
+ *
999
+ * The caller should always perform the following three tasks when using
1000
+ * attributes:
1001
+ * <ul>
1002
+ * <li>Zero the structure
1003
+ * <li>Set the version field
1004
+ * <li>Set the size field
1005
+ * </ul>
1006
+ *
1007
+ * Zeroing the structure sets all the resource attributes types and values
1008
+ * to the default value.
1009
+ *
1010
+ * The version and size field are used by the Tools Extension
1011
+ * implementation to handle multiple versions of the attributes structure.
1012
+ *
1013
+ * It is recommended that the caller use one of the following to methods
1014
+ * to initialize the event attributes structure:
1015
+ *
1016
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
1017
+ * \code
1018
+ * nvtxResourceAttributes_t attribs = {0};
1019
+ * attribs.version = NVTX_VERSION;
1020
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1021
+ * \endcode
1022
+ *
1023
+ * \par Method 2: Initializing nvtxEventAttributes for a specific version
1024
+ * \code
1025
+ * nvtxResourceAttributes_v0 attribs = {0};
1026
+ * attribs.version = 2;
1027
+ * attribs.size = (uint16_t)(sizeof(nvtxResourceAttributes_v0));
1028
+ * \endcode
1029
+ *
1030
+ * If the caller uses Method 1 it is critical that the entire binary
1031
+ * layout of the structure be configured to 0 so that all fields
1032
+ * are initialized to the default value.
1033
+ *
1034
+ * The caller should either use both NVTX_VERSION and
1035
+ * NVTX_RESOURCE_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
1036
+ * and a versioned type (Method 2). Using a mix of the two methods
1037
+ * will likely cause either source level incompatibility or binary
1038
+ * incompatibility in the future.
1039
+ *
1040
+ * \par Settings Attribute Types and Values
1041
+ *
1042
+ *
1043
+ * \par Example:
1044
+ * \code
1045
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1046
+ *
1047
+ * // Initialize
1048
+ * nvtxResourceAttributes_t attribs = {0};
1049
+ * attribs.version = NVTX_VERSION;
1050
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1051
+ *
1052
+ * // Configure the Attributes
1053
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1054
+ * attribs.identifier.pValue = (const void*)pMutex;
1055
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1056
+ * attribs.message.ascii = "Single thread access to database.";
1057
+ *
1058
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1059
+ * \endcode
1060
+ *
1061
+ * \sa
1062
+ * ::nvtxDomainResourceCreate
1063
+ */
1064
+ typedef struct nvtxResourceAttributes_v0
1065
+ {
1066
+ /**
1067
+ * \brief Version flag of the structure.
1068
+ *
1069
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
1070
+ * supported in this header file. This can optionally be overridden to
1071
+ * another version of the tools extension library.
1072
+ */
1073
+ uint16_t version;
1074
+
1075
+ /**
1076
+ * \brief Size of the structure.
1077
+ *
1078
+ * Needs to be set to the size in bytes of this attribute
1079
+ * structure.
1080
+ */
1081
+ uint16_t size;
1082
+
1083
+ /**
1084
+ * \brief Identifier type specifies how to interpret the identifier field
1085
+ *
1086
+ * Defines the identifier format of the attribute structure's \ref RESOURCE_IDENTIFIER_FIELD
1087
+ * "identifier" field.
1088
+ *
1089
+ * Default Value is NVTX_RESOURCE_TYPE_UNKNOWN
1090
+ */
1091
+ int32_t identifierType; /* values from enums following the pattern nvtxResource[name]Type_t */
1092
+
1093
+ /**
1094
+ * \brief Identifier for the resource.
1095
+ * \anchor RESOURCE_IDENTIFIER_FIELD
1096
+ *
1097
+ * An identifier may be a pointer or a handle to an OS or middleware API object.
1098
+ * The resource type will assist in avoiding collisions where handles values may collide.
1099
+ */
1100
+ union identifier_t
1101
+ {
1102
+ const void* pValue;
1103
+ uint64_t ullValue;
1104
+ } identifier;
1105
+
1106
+ /** \brief Message type specified in this attribute structure.
1107
+ *
1108
+ * Defines the message format of the attribute structure's \ref RESOURCE_MESSAGE_FIELD
1109
+ * "message" field.
1110
+ *
1111
+ * Default Value is NVTX_MESSAGE_UNKNOWN
1112
+ */
1113
+ int32_t messageType; /* nvtxMessageType_t */
1114
+
1115
+ /** \brief Message assigned to this attribute structure. \anchor RESOURCE_MESSAGE_FIELD
1116
+ *
1117
+ * The text message that is attached to a resource.
1118
+ */
1119
+ nvtxMessageValue_t message;
1120
+
1121
+ } nvtxResourceAttributes_v0;
1122
+
1123
+ typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t;
1124
+
1125
+ /* \cond SHOW_HIDDEN
1126
+ * \version \NVTX_VERSION_2
1127
+ */
1128
+ #define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) )
1129
+ typedef struct nvtxResourceHandle* nvtxResourceHandle_t;
1130
+ /** \endcond */
1131
+
1132
+
1133
+
1134
+ /* ------------------------------------------------------------------------- */
1135
+ /** \brief Create a resource object to track and associate data with OS and middleware objects
1136
+ *
1137
+ * Allows users to associate an API handle or pointer with a user-provided name.
1138
+ *
1139
+ *
1140
+ * \param domain - Domain to own the resource object
1141
+ * \param attribs - Attributes to be associated with the resource
1142
+ *
1143
+ * \return A handle that represents the newly created resource object.
1144
+ *
1145
+ * \par Example:
1146
+ * \code
1147
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1148
+ * nvtxResourceAttributes_t attribs = {0};
1149
+ * attribs.version = NVTX_VERSION;
1150
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1151
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1152
+ * attribs.identifier.pValue = (const void*)pMutex;
1153
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1154
+ * attribs.message.ascii = "Single thread access to database.";
1155
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1156
+ * \endcode
1157
+ *
1158
+ * \sa
1159
+ * ::nvtxResourceAttributes_t
1160
+ * ::nvtxDomainResourceDestroy
1161
+ *
1162
+ * \version \NVTX_VERSION_2
1163
+ * @{ */
1164
+ NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
1165
+ /** @} */
1166
+
1167
+ /* ------------------------------------------------------------------------- */
1168
+ /** \brief Destroy a resource object to track and associate data with OS and middleware objects
1169
+ *
1170
+ * Allows users to associate an API handle or pointer with a user-provided name.
1171
+ *
1172
+ * \param resource - Handle to the resource in which to operate.
1173
+ *
1174
+ * \par Example:
1175
+ * \code
1176
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1177
+ * nvtxResourceAttributes_t attribs = {0};
1178
+ * attribs.version = NVTX_VERSION;
1179
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1180
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1181
+ * attribs.identifier.pValue = (const void*)pMutex;
1182
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1183
+ * attribs.message.ascii = "Single thread access to database.";
1184
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1185
+ * nvtxDomainResourceDestroy(handle);
1186
+ * \endcode
1187
+ *
1188
+ * \sa
1189
+ * ::nvtxDomainResourceCreate
1190
+ *
1191
+ * \version \NVTX_VERSION_2
1192
+ * @{ */
1193
+ NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource);
1194
+ /** @} */
1195
+
1196
+
1197
+ /** \name Functions for NVTX Category Naming*/
1198
+
1199
+ /* ------------------------------------------------------------------------- */
1200
+ /**
1201
+ * \brief Annotate an NVTX category used within a domain.
1202
+ *
1203
+ * Categories are used to group sets of events. Each category is identified
1204
+ * through a unique ID and that ID is passed into any of the marker/range
1205
+ * events to assign that event to a specific category. The nvtxDomainNameCategory
1206
+ * function calls allow the user to assign a name to a category ID that is
1207
+ * specific to the domain.
1208
+ *
1209
+ * nvtxDomainNameCategory(NULL, category, name) is equivalent to calling
1210
+ * nvtxNameCategory(category, name).
1211
+ *
1212
+ * \param domain - The domain of scoping the category.
1213
+ * \param category - The category ID to name.
1214
+ * \param name - The name of the category.
1215
+ *
1216
+ * \remarks The category names are tracked per domain.
1217
+ *
1218
+ * \par Example:
1219
+ * \code
1220
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example");
1221
+ * nvtxDomainNameCategoryA(domain, 1, "Memory Allocation");
1222
+ * nvtxDomainNameCategoryW(domain, 2, L"Memory Transfer");
1223
+ * \endcode
1224
+ *
1225
+ * \version \NVTX_VERSION_2
1226
+ * @{ */
1227
+ NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name);
1228
+ NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
1229
+ /** @} */
1230
+
1231
+ /** \brief Annotate an NVTX category.
1232
+ *
1233
+ * Categories are used to group sets of events. Each category is identified
1234
+ * through a unique ID and that ID is passed into any of the marker/range
1235
+ * events to assign that event to a specific category. The nvtxNameCategory
1236
+ * function calls allow the user to assign a name to a category ID.
1237
+ *
1238
+ * \param category - The category ID to name.
1239
+ * \param name - The name of the category.
1240
+ *
1241
+ * \remarks The category names are tracked per process.
1242
+ *
1243
+ * \par Example:
1244
+ * \code
1245
+ * nvtxNameCategory(1, "Memory Allocation");
1246
+ * nvtxNameCategory(2, "Memory Transfer");
1247
+ * nvtxNameCategory(3, "Memory Object Lifetime");
1248
+ * \endcode
1249
+ *
1250
+ * \version \NVTX_VERSION_1
1251
+ * @{ */
1252
+ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name);
1253
+ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name);
1254
+ /** @} */
1255
+
1256
+ /** \name Functions for OS Threads Naming*/
1257
+
1258
+ /* ------------------------------------------------------------------------- */
1259
+ /** \brief Annotate an OS thread.
1260
+ *
1261
+ * Allows the user to name an active thread of the current process. If an
1262
+ * invalid thread ID is provided or a thread ID from a different process is
1263
+ * used the behavior of the tool is implementation dependent.
1264
+ *
1265
+ * Tools expect thread ID to be a number that uniquely identifies the thread
1266
+ * at the time of the call. Note that a thread's ID can be reused after
1267
+ * it is destroyed. Tools may choose how to handle aliasing of thread IDs.
1268
+ *
1269
+ * POSIX pthread_t type returned by pthread_self() may not comply with these
1270
+ * expectations. Please use OS-specific thread ID instead of pthread_t.
1271
+ *
1272
+ * The thread name is associated to the default domain. To support domains
1273
+ * use resource objects via ::nvtxDomainResourceCreate.
1274
+ *
1275
+ * \param threadId - The ID of the thread to name.
1276
+ * \param name - The name of the thread.
1277
+ *
1278
+ * \par Examples:
1279
+ * MS Windows:
1280
+ * \code
1281
+ * #include <windows.h>
1282
+ * nvtxNameOsThread(GetCurrentThreadId(), "Current thread");
1283
+ * nvtxNameOsThread(GetThreadId(SomeThreadHandle), "Other thread");
1284
+ * \endcode
1285
+ *
1286
+ * Android:
1287
+ * \code
1288
+ * #include <unistd.h>
1289
+ * nvtxNameOsThreadA(gettid(), "Current thread");
1290
+ * nvtxNameOsThreadA(getpid(), "Main thread");
1291
+ * \endcode
1292
+ *
1293
+ * Linux:
1294
+ * \code
1295
+ * #include <sys/syscall.h>
1296
+ * nvtxNameOsThreadA(syscall(SYS_gettid), "Current thread");
1297
+ * \endcode
1298
+ * \code
1299
+ * #include <unistd.h>
1300
+ * nvtxNameOsThreadA(getpid(), "Main thread");
1301
+ * \endcode
1302
+ *
1303
+ * OS X:
1304
+ * \code
1305
+ * #include <sys/syscall.h>
1306
+ * nvtxNameOsThreadA(syscall(SYS_thread_selfid), "Current thread");
1307
+ * \endcode
1308
+ * \code
1309
+ * #include <pthread.h>
1310
+ * __uint64_t id;
1311
+ * pthread_threadid_np(pthread_self(), &id);
1312
+ * nvtxNameOsThreadA(id, "Current thread");
1313
+ * pthread_threadid_np(somePThreadId, &id);
1314
+ * nvtxNameOsThreadA(id, "Other thread");
1315
+ * \endcode
1316
+ *
1317
+ * \version \NVTX_VERSION_1
1318
+ * @{ */
1319
+ NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name);
1320
+ NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name);
1321
+ /** @} */
1322
+
1323
+
1324
+ /** @} */ /*END defgroup*/
1325
+ /* ========================================================================= */
1326
+ /** \defgroup STRING_REGISTRATION String Registration
1327
+ *
1328
+ * Registered strings are intended to increase performance by lowering instrumentation
1329
+ * overhead. String may be registered once and the handle may be passed in place of
1330
+ * a string where an the APIs may allow.
1331
+ *
1332
+ * See \ref STRING_REGISTRATION for more details
1333
+ *
1334
+ * @{
1335
+ */
1336
+
1337
+ /* ------------------------------------------------------------------------- */
1338
+ /** \brief Register a string.
1339
+
1340
+ * Registers an immutable string with NVTX. Once registered the pointer used
1341
+ * to register the domain name can be used in nvtxEventAttributes_t
1342
+ * \ref MESSAGE_FIELD. This allows NVTX implementation to skip copying the
1343
+ * contents of the message on each event invocation.
1344
+ *
1345
+ * String registration is an optimization. It is recommended to use string
1346
+ * registration if the string will be passed to an event many times.
1347
+ *
1348
+ * String are not unregistered, except that by unregistering the entire domain
1349
+ *
1350
+ * \param domain - Domain handle. If NULL then the global domain is used.
1351
+ * \param string - A unique pointer to a sequence of characters.
1352
+ *
1353
+ * \return A handle representing the registered string.
1354
+ *
1355
+ * \par Example:
1356
+ * \code
1357
+ * nvtxDomainCreateA("com.nvidia.nvtx.example");
1358
+ * nvtxStringHandle_t message = nvtxDomainRegisterStringA(domain, "registered string");
1359
+ * nvtxEventAttributes_t eventAttrib = {0};
1360
+ * eventAttrib.version = NVTX_VERSION;
1361
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1362
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_REGISTERED;
1363
+ * eventAttrib.message.registered = message;
1364
+ * \endcode
1365
+ *
1366
+ * \version \NVTX_VERSION_2
1367
+ * @{ */
1368
+ NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string);
1369
+ NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string);
1370
+ /** @} */
1371
+
1372
+ /** @} */ /*END defgroup*/
1373
+ /* ========================================================================= */
1374
+ /** \defgroup DOMAINS Domains
1375
+ *
1376
+ * Domains are used to group events to a developer defined scope. Middleware
1377
+ * vendors may also scope their own events to avoid collisions with the
1378
+ * the application developer's events, so that the application developer may
1379
+ * inspect both parts and easily differentiate or filter them. By default
1380
+ * all events are scoped to a global domain where NULL is provided or when
1381
+ * using APIs provided b versions of NVTX below v2
1382
+ *
1383
+ * Domains are intended to be typically long lived objects with the intention
1384
+ * of logically separating events of large modules from each other such as
1385
+ * middleware libraries from each other and the main application.
1386
+ *
1387
+ * See \ref DOMAINS for more details
1388
+ *
1389
+ * @{
1390
+ */
1391
+
1392
+ /* ------------------------------------------------------------------------- */
1393
+ /** \brief Register a NVTX domain.
1394
+ *
1395
+ * Domains are used to scope annotations. All NVTX_VERSION_0 and NVTX_VERSION_1
1396
+ * annotations are scoped to the global domain. The function nvtxDomainCreate
1397
+ * creates a new named domain.
1398
+ *
1399
+ * Each domain maintains its own nvtxRangePush and nvtxRangePop stack.
1400
+ *
1401
+ * \param name - A unique string representing the domain.
1402
+ *
1403
+ * \return A handle representing the domain.
1404
+ *
1405
+ * \par Example:
1406
+ * \code
1407
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example");
1408
+ *
1409
+ * nvtxMarkA("nvtxMarkA to global domain");
1410
+ *
1411
+ * nvtxEventAttributes_t eventAttrib1 = {0};
1412
+ * eventAttrib1.version = NVTX_VERSION;
1413
+ * eventAttrib1.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1414
+ * eventAttrib1.message.ascii = "nvtxDomainMarkEx to global domain";
1415
+ * nvtxDomainMarkEx(NULL, &eventAttrib1);
1416
+ *
1417
+ * nvtxEventAttributes_t eventAttrib2 = {0};
1418
+ * eventAttrib2.version = NVTX_VERSION;
1419
+ * eventAttrib2.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1420
+ * eventAttrib2.message.ascii = "nvtxDomainMarkEx to com.nvidia.nvtx.example";
1421
+ * nvtxDomainMarkEx(domain, &eventAttrib2);
1422
+ * nvtxDomainDestroy(domain);
1423
+ * \endcode
1424
+ *
1425
+ * \sa
1426
+ * ::nvtxDomainDestroy
1427
+ *
1428
+ * \version \NVTX_VERSION_2
1429
+ * @{ */
1430
+ NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* name);
1431
+ NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* name);
1432
+ /** @} */
1433
+
1434
+ /* ------------------------------------------------------------------------- */
1435
+ /** \brief Unregister a NVTX domain.
1436
+ *
1437
+ * Unregisters the domain handle and frees all domain specific resources.
1438
+ *
1439
+ * \param domain - the domain handle
1440
+ *
1441
+ * \par Example:
1442
+ * \code
1443
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example");
1444
+ * nvtxDomainDestroy(domain);
1445
+ * \endcode
1446
+ *
1447
+ * \sa
1448
+ * ::nvtxDomainCreateA
1449
+ * ::nvtxDomainCreateW
1450
+ *
1451
+ * \version \NVTX_VERSION_2
1452
+ * @{ */
1453
+ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain);
1454
+ /** @} */
1455
+
1456
+
1457
+ /** @} */ /*END defgroup*/
1458
+ /* ========================================================================= */
1459
+ /** \cond SHOW_HIDDEN */
1460
+
1461
+ #ifdef UNICODE
1462
+ #define nvtxMark nvtxMarkW
1463
+ #define nvtxRangeStart nvtxRangeStartW
1464
+ #define nvtxRangePush nvtxRangePushW
1465
+ #define nvtxNameCategory nvtxNameCategoryW
1466
+ #define nvtxNameOsThread nvtxNameOsThreadW
1467
+ /* NVTX_VERSION_2 */
1468
+ #define nvtxDomainCreate nvtxDomainCreateW
1469
+ #define nvtxDomainRegisterString nvtxDomainRegisterStringW
1470
+ #define nvtxDomainNameCategory nvtxDomainNameCategoryW
1471
+ #else
1472
+ #define nvtxMark nvtxMarkA
1473
+ #define nvtxRangeStart nvtxRangeStartA
1474
+ #define nvtxRangePush nvtxRangePushA
1475
+ #define nvtxNameCategory nvtxNameCategoryA
1476
+ #define nvtxNameOsThread nvtxNameOsThreadA
1477
+ /* NVTX_VERSION_2 */
1478
+ #define nvtxDomainCreate nvtxDomainCreateA
1479
+ #define nvtxDomainRegisterString nvtxDomainRegisterStringA
1480
+ #define nvtxDomainNameCategory nvtxDomainNameCategoryA
1481
+ #endif
1482
+
1483
+ /** \endcond */
1484
+
1485
+ #ifdef __cplusplus
1486
+ } /* extern "C" */
1487
+ #endif /* __cplusplus */
1488
+
1489
+ #define NVTX_IMPL_GUARD /* Ensure other headers cannot included directly */
1490
+
1491
+ #include "nvtxDetail/nvtxTypes.h"
1492
+
1493
+ #ifndef NVTX_NO_IMPL
1494
+ #include "nvtxDetail/nvtxImpl.h"
1495
+ #endif /*NVTX_NO_IMPL*/
1496
+
1497
+ #undef NVTX_IMPL_GUARD
1498
+
1499
+ #endif /* !defined(NVTX_VERSION) */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtCuda.h ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #include "nvToolsExt.h"
39
+
40
+ #include "cuda.h"
41
+
42
+ #ifndef NVTOOLSEXT_CUDA_V3
43
+ #define NVTOOLSEXT_CUDA_V3
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for CUDA Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_CUDA CUDA Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate CUDA resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_CUDA 4
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for CUDA
71
+ */
72
+ typedef enum nvtxResourceCUDAType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
75
+ NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
76
+ NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
77
+ NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
78
+ } nvtxResourceCUDAType_t;
79
+
80
+
81
+ /* ------------------------------------------------------------------------- */
82
+ /** \brief Annotates a CUDA device.
83
+ *
84
+ * Allows the user to associate a CUDA device with a user-provided name.
85
+ *
86
+ * \param device - The handle of the CUDA device to name.
87
+ * \param name - The name of the CUDA device.
88
+ *
89
+ * \version \NVTX_VERSION_1
90
+ * @{ */
91
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
92
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
93
+ /** @} */
94
+
95
+ /* ------------------------------------------------------------------------- */
96
+ /** \brief Annotates a CUDA context.
97
+ *
98
+ * Allows the user to associate a CUDA context with a user-provided name.
99
+ *
100
+ * \param context - The handle of the CUDA context to name.
101
+ * \param name - The name of the CUDA context.
102
+ *
103
+ * \par Example:
104
+ * \code
105
+ * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
106
+ * if ( CUDA_SUCCESS != status )
107
+ * goto Error;
108
+ * nvtxNameCuContext(cuContext, "CTX_NAME");
109
+ * \endcode
110
+ *
111
+ * \version \NVTX_VERSION_1
112
+ * @{ */
113
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
114
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
115
+ /** @} */
116
+
117
+ /* ------------------------------------------------------------------------- */
118
+ /** \brief Annotates a CUDA stream.
119
+ *
120
+ * Allows the user to associate a CUDA stream with a user-provided name.
121
+ *
122
+ * \param stream - The handle of the CUDA stream to name.
123
+ * \param name - The name of the CUDA stream.
124
+ *
125
+ * \version \NVTX_VERSION_1
126
+ * @{ */
127
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
128
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
129
+ /** @} */
130
+
131
+ /* ------------------------------------------------------------------------- */
132
+ /** \brief Annotates a CUDA event.
133
+ *
134
+ * Allows the user to associate a CUDA event with a user-provided name.
135
+ *
136
+ * \param event - The handle of the CUDA event to name.
137
+ * \param name - The name of the CUDA event.
138
+ *
139
+ * \version \NVTX_VERSION_1
140
+ * @{ */
141
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
142
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
143
+ /** @} */
144
+
145
+ /** @} */ /* END RESOURCE_NAMING */
146
+
147
+ /* ========================================================================= */
148
+ #ifdef UNICODE
149
+ #define nvtxNameCuDevice nvtxNameCuDeviceW
150
+ #define nvtxNameCuContext nvtxNameCuContextW
151
+ #define nvtxNameCuStream nvtxNameCuStreamW
152
+ #define nvtxNameCuEvent nvtxNameCuEventW
153
+ #else
154
+ #define nvtxNameCuDevice nvtxNameCuDeviceA
155
+ #define nvtxNameCuContext nvtxNameCuContextA
156
+ #define nvtxNameCuStream nvtxNameCuStreamA
157
+ #define nvtxNameCuEvent nvtxNameCuEventA
158
+ #endif
159
+
160
+ #ifdef __cplusplus
161
+ }
162
+ #endif /* __cplusplus */
163
+
164
+ #ifndef NVTX_NO_IMPL
165
+ #define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */
166
+ #include "nvtxDetail/nvtxImplCuda_v3.h"
167
+ #undef NVTX_IMPL_GUARD_CUDA
168
+ #endif /*NVTX_NO_IMPL*/
169
+
170
+ #endif /* NVTOOLSEXT_CUDA_V3 */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtCudaRt.h ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #include "nvToolsExt.h"
39
+
40
+ #include "cuda.h"
41
+ #include "driver_types.h"
42
+
43
+ #ifndef NVTOOLSEXT_CUDART_V3
44
+ #define NVTOOLSEXT_CUDART_V3
45
+
46
+ #ifdef __cplusplus
47
+ extern "C" {
48
+ #endif /* __cplusplus */
49
+
50
+ /* ========================================================================= */
51
+ /** \name Functions for CUDA Resource Naming
52
+ */
53
+ /** \addtogroup RESOURCE_NAMING
54
+ * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
55
+ *
56
+ * This section covers the API functions that allow to annotate CUDA resources
57
+ * with user-provided names.
58
+ *
59
+ * @{
60
+ */
61
+
62
+ /* ------------------------------------------------------------------------- */
63
+ /* \cond SHOW_HIDDEN
64
+ * \brief Used to build a non-colliding value for resource types separated class
65
+ * \version \NVTX_VERSION_2
66
+ */
67
+ #define NVTX_RESOURCE_CLASS_CUDART 5
68
+ /** \endcond */
69
+
70
+ /* ------------------------------------------------------------------------- */
71
+ /** \brief Resource types for CUDART
72
+ */
73
+ typedef enum nvtxResourceCUDARTType_t
74
+ {
75
+ NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
76
+ NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
77
+ NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
78
+ } nvtxResourceCUDARTType_t;
79
+
80
+
81
+ /* ------------------------------------------------------------------------- */
82
+ /** \brief Annotates a CUDA device.
83
+ *
84
+ * Allows the user to associate a CUDA device with a user-provided name.
85
+ *
86
+ * \param device - The id of the CUDA device to name.
87
+ * \param name - The name of the CUDA device.
88
+ *
89
+ * \version \NVTX_VERSION_1
90
+ * @{ */
91
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
92
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
93
+ /** @} */
94
+
95
+ /* ------------------------------------------------------------------------- */
96
+ /** \brief Annotates a CUDA stream.
97
+ *
98
+ * Allows the user to associate a CUDA stream with a user-provided name.
99
+ *
100
+ * \param stream - The handle of the CUDA stream to name.
101
+ * \param name - The name of the CUDA stream.
102
+ *
103
+ * \version \NVTX_VERSION_1
104
+ * @{ */
105
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
106
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
107
+ /** @} */
108
+
109
+ /* ------------------------------------------------------------------------- */
110
+ /** \brief Annotates a CUDA event.
111
+ *
112
+ * Allows the user to associate a CUDA event with a user-provided name.
113
+ *
114
+ * \param event - The handle of the CUDA event to name.
115
+ * \param name - The name of the CUDA event.
116
+ *
117
+ * \version \NVTX_VERSION_1
118
+ * @{ */
119
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
120
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
121
+ /** @} */
122
+
123
+ /** @} */ /* END RESOURCE_NAMING */
124
+
125
+ /* ========================================================================= */
126
+ #ifdef UNICODE
127
+ #define nvtxNameCudaDevice nvtxNameCudaDeviceW
128
+ #define nvtxNameCudaStream nvtxNameCudaStreamW
129
+ #define nvtxNameCudaEvent nvtxNameCudaEventW
130
+ #else
131
+ #define nvtxNameCudaDevice nvtxNameCudaDeviceA
132
+ #define nvtxNameCudaStream nvtxNameCudaStreamA
133
+ #define nvtxNameCudaEvent nvtxNameCudaEventA
134
+ #endif
135
+
136
+ #ifdef __cplusplus
137
+ }
138
+ #endif /* __cplusplus */
139
+
140
+ #ifndef NVTX_NO_IMPL
141
+ #define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */
142
+ #include "nvtxDetail/nvtxImplCudaRt_v3.h"
143
+ #undef NVTX_IMPL_GUARD_CUDART
144
+ #endif /*NVTX_NO_IMPL*/
145
+
146
+ #endif /* NVTOOLSEXT_CUDART_V3 */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtOpenCL.h ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #include "nvToolsExt.h"
39
+
40
+ #include <CL/cl.h>
41
+
42
+ #ifndef NVTOOLSEXT_OPENCL_V3
43
+ #define NVTOOLSEXT_OPENCL_V3
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for OpenCL Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate OpenCL resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_OPENCL 6
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for OpenCL
71
+ */
72
+ typedef enum nvtxResourceOpenCLType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
75
+ NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
76
+ NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
77
+ NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
78
+ NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
79
+ NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
80
+ NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
81
+ } nvtxResourceOpenCLType_t;
82
+
83
+
84
+ /* ------------------------------------------------------------------------- */
85
+ /** \brief Annotates an OpenCL device.
86
+ *
87
+ * Allows to associate an OpenCL device with a user-provided name.
88
+ *
89
+ * \param device - The handle of the OpenCL device to name.
90
+ * \param name - The name of the OpenCL device.
91
+ *
92
+ * \version \NVTX_VERSION_1
93
+ * @{ */
94
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
95
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
96
+ /** @} */
97
+
98
+ /* ------------------------------------------------------------------------- */
99
+ /** \brief Annotates an OpenCL context.
100
+ *
101
+ * Allows to associate an OpenCL context with a user-provided name.
102
+ *
103
+ * \param context - The handle of the OpenCL context to name.
104
+ * \param name - The name of the OpenCL context.
105
+ *
106
+ * \version \NVTX_VERSION_1
107
+ * @{ */
108
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
109
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
110
+ /** @} */
111
+
112
+ /* ------------------------------------------------------------------------- */
113
+ /** \brief Annotates an OpenCL command queue.
114
+ *
115
+ * Allows to associate an OpenCL command queue with a user-provided name.
116
+ *
117
+ * \param command_queue - The handle of the OpenCL command queue to name.
118
+ * \param name - The name of the OpenCL command queue.
119
+ *
120
+ * \version \NVTX_VERSION_1
121
+ * @{ */
122
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
123
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
124
+ /** @} */
125
+
126
+ /* ------------------------------------------------------------------------- */
127
+ /** \brief Annotates an OpenCL memory object.
128
+ *
129
+ * Allows to associate an OpenCL memory object with a user-provided name.
130
+ *
131
+ * \param memobj - The handle of the OpenCL memory object to name.
132
+ * \param name - The name of the OpenCL memory object.
133
+ *
134
+ * \version \NVTX_VERSION_1
135
+ * @{ */
136
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
137
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
138
+ /** @} */
139
+
140
+ /* ------------------------------------------------------------------------- */
141
+ /** \brief Annotates an OpenCL sampler.
142
+ *
143
+ * Allows to associate an OpenCL sampler with a user-provided name.
144
+ *
145
+ * \param sampler - The handle of the OpenCL sampler to name.
146
+ * \param name - The name of the OpenCL sampler.
147
+ *
148
+ * \version \NVTX_VERSION_1
149
+ * @{ */
150
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
151
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
152
+ /** @} */
153
+
154
+ /* ------------------------------------------------------------------------- */
155
+ /** \brief Annotates an OpenCL program.
156
+ *
157
+ * Allows to associate an OpenCL program with a user-provided name.
158
+ *
159
+ * \param program - The handle of the OpenCL program to name.
160
+ * \param name - The name of the OpenCL program.
161
+ *
162
+ * \code
163
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
164
+ * (const char **) &cSourceCL, &program_length, &ciErrNum);
165
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
166
+ * nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
167
+ * \endcode
168
+ *
169
+ * \version \NVTX_VERSION_1
170
+ * @{ */
171
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
172
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
173
+ /** @} */
174
+
175
+ /* ------------------------------------------------------------------------- */
176
+ /** \brief Annotates an OpenCL event.
177
+ *
178
+ * Allows to associate an OpenCL event with a user-provided name.
179
+ *
180
+ * \param evnt - The handle of the OpenCL event to name.
181
+ * \param name - The name of the OpenCL event.
182
+ *
183
+ * \version \NVTX_VERSION_1
184
+ * @{ */
185
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
186
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
187
+ /** @} */
188
+
189
+ /** @} */ /* END RESOURCE_NAMING */
190
+
191
+ /* ========================================================================= */
192
+ #ifdef UNICODE
193
+ #define nvtxNameClDevice nvtxNameClDeviceW
194
+ #define nvtxNameClContext nvtxNameClContextW
195
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueW
196
+ #define nvtxNameClMemObject nvtxNameClMemObjectW
197
+ #define nvtxNameClSampler nvtxNameClSamplerW
198
+ #define nvtxNameClProgram nvtxNameClProgramW
199
+ #define nvtxNameClEvent nvtxNameClEventW
200
+ #else
201
+ #define nvtxNameClDevice nvtxNameClDeviceA
202
+ #define nvtxNameClContext nvtxNameClContextA
203
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueA
204
+ #define nvtxNameClMemObject nvtxNameClMemObjectA
205
+ #define nvtxNameClSampler nvtxNameClSamplerA
206
+ #define nvtxNameClProgram nvtxNameClProgramA
207
+ #define nvtxNameClEvent nvtxNameClEventA
208
+ #endif
209
+
210
+ #ifdef __cplusplus
211
+ }
212
+ #endif /* __cplusplus */
213
+
214
+ #ifndef NVTX_NO_IMPL
215
+ #define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
216
+ #include "nvtxDetail/nvtxImplOpenCL_v3.h"
217
+ #undef NVTX_IMPL_GUARD_OPENCL
218
+ #endif /*NVTX_NO_IMPL*/
219
+
220
+ #endif /* NVTOOLSEXT_OPENCL_V3 */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtSync.h ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #include "nvToolsExt.h"
39
+
40
+ #ifndef NVTOOLSEXT_SYNC_V3
41
+ #define NVTOOLSEXT_SYNC_V3
42
+
43
+ #ifdef __cplusplus
44
+ extern "C" {
45
+ #endif /* __cplusplus */
46
+
47
+ /* \cond SHOW_HIDDEN
48
+ * \version \NVTX_VERSION_2
49
+ */
50
+ #define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
51
+ /** \endcond */
52
+
53
+
54
+ /**
55
+ * \page PAGE_SYNCHRONIZATION Synchronization
56
+ *
57
+ * This section covers a subset of the API that allow users to track additional
58
+ * synchronization details of their application. Naming OS synchronization primitives
59
+ * may allow users to better understand the data collected by traced synchronization
60
+ * APIs. Additionally, a user defined synchronization object can allow the users to
61
+ * to tell the tools when the user is building their own synchronization system
62
+ * that do not rely on the OS to provide behaviors and instead use techniques like
63
+ * atomic operations and spinlocks.
64
+ *
65
+ * See module \ref SYNCHRONIZATION for details.
66
+ *
67
+ * \par Example:
68
+ * \code
69
+ * class MyMutex
70
+ * {
71
+ * volatile long bLocked;
72
+ * nvtxSyncUser_t hSync;
73
+ * public:
74
+ * MyMutex(const char* name, nvtxDomainHandle_t d){
75
+ * bLocked = 0;
76
+ *
77
+ * nvtxSyncUserAttributes_t attribs = { 0 };
78
+ * attribs.version = NVTX_VERSION;
79
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
80
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
81
+ * attribs.message.ascii = name;
82
+ * hSync = nvtxDomainSyncUserCreate(d, &attribs);
83
+ * }
84
+ *
85
+ * ~MyMutex() {
86
+ * nvtxDomainSyncUserDestroy(hSync);
87
+ * }
88
+ *
89
+ * bool Lock() {
90
+ * nvtxDomainSyncUserAcquireStart(hSync);
91
+ * bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
92
+
93
+ * if (acquired) {
94
+ * nvtxDomainSyncUserAcquireSuccess(hSync);
95
+ * }
96
+ * else {
97
+ * nvtxDomainSyncUserAcquireFailed(hSync);
98
+ * }
99
+ * return acquired;
100
+ * }
101
+
102
+ * void Unlock() {
103
+ * nvtxDomainSyncUserReleasing(hSync);
104
+ * bLocked = false;
105
+ * }
106
+ * };
107
+ * \endcode
108
+ *
109
+ * \version \NVTX_VERSION_2
110
+ */
111
+
112
+ /* ------------------------------------------------------------------------- */
113
+ /* \cond SHOW_HIDDEN
114
+ * \brief Used to build a non-colliding value for resource types separated class
115
+ * \version \NVTX_VERSION_2
116
+ */
117
+ #define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */
118
+ #define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/
119
+ /** \endcond */
120
+
121
+
122
+ /* ------------------------------------------------------------------------- */
123
+ /** \defgroup SYNCHRONIZATION Synchronization
124
+ * See page \ref PAGE_SYNCHRONIZATION.
125
+ * @{
126
+ */
127
+
128
+ /** \brief Resource type values for OSs with POSIX Thread API support
129
+ */
130
+ typedef enum nvtxResourceSyncPosixThreadType_t
131
+ {
132
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */
133
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */
134
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */
135
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */
136
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */
137
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */
138
+ } nvtxResourceSyncPosixThreadType_t;
139
+
140
+ /** \brief Resource type values for Windows OSs
141
+ */
142
+ typedef enum nvtxResourceSyncWindowsType_t
143
+ {
144
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
145
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
146
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
147
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
148
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
149
+ } nvtxResourceSyncWindowsType_t;
150
+
151
+ /** \brief Resource type values for Linux and Linux derived OSs such as Android
152
+ * \sa
153
+ * ::nvtxResourceSyncPosixThreadType_t
154
+ */
155
+ typedef enum nvtxResourceSyncLinuxType_t
156
+ {
157
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
158
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
159
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
160
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
161
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
162
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
163
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
164
+ } nvtxResourceSyncLinuxType_t;
165
+
166
+ /** \brief Resource type values for Android come from Linux.
167
+ * \sa
168
+ * ::nvtxResourceSyncLinuxType_t
169
+ * ::nvtxResourceSyncPosixThreadType_t
170
+ */
171
+ typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
172
+
173
+ /** \brief User Defined Synchronization Object Handle .
174
+ * \anchor SYNCUSER_HANDLE_STRUCTURE
175
+ *
176
+ * This structure is opaque to the user and is used as a handle to reference
177
+ * a user defined syncrhonization object. The tools will return a pointer through the API for the application
178
+ * to hold on it's behalf to reference the string in the future.
179
+ *
180
+ */
181
+ typedef struct nvtxSyncUser* nvtxSyncUser_t;
182
+
183
+ /** \brief User Defined Synchronization Object Attributes Structure.
184
+ * \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
185
+ *
186
+ * This structure is used to describe the attributes of a user defined synchronization
187
+ * object. The layout of the structure is defined by a specific version of the tools
188
+ * extension library and can change between different versions of the Tools Extension
189
+ * library.
190
+ *
191
+ * \par Initializing the Attributes
192
+ *
193
+ * The caller should always perform the following three tasks when using
194
+ * attributes:
195
+ * <ul>
196
+ * <li>Zero the structure
197
+ * <li>Set the version field
198
+ * <li>Set the size field
199
+ * </ul>
200
+ *
201
+ * Zeroing the structure sets all the event attributes types and values
202
+ * to the default value.
203
+ *
204
+ * The version and size field are used by the Tools Extension
205
+ * implementation to handle multiple versions of the attributes structure.
206
+ *
207
+ * It is recommended that the caller use one of the following to methods
208
+ * to initialize the event attributes structure:
209
+ *
210
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
211
+ * \code
212
+ * nvtxSyncUserAttributes_t attribs = {0};
213
+ * attribs.version = NVTX_VERSION;
214
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
215
+ * \endcode
216
+ *
217
+ * \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
218
+ * \code
219
+ * nvtxSyncUserAttributes_t attribs = {0};
220
+ * attribs.version = 1;
221
+ * attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
222
+ * \endcode
223
+ *
224
+ * If the caller uses Method 1 it is critical that the entire binary
225
+ * layout of the structure be configured to 0 so that all fields
226
+ * are initialized to the default value.
227
+ *
228
+ * The caller should either use both NVTX_VERSION and
229
+ * NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
230
+ * and a versioned type (Method 2). Using a mix of the two methods
231
+ * will likely cause either source level incompatibility or binary
232
+ * incompatibility in the future.
233
+ *
234
+ * \par Settings Attribute Types and Values
235
+ *
236
+ *
237
+ * \par Example:
238
+ * \code
239
+ * // Initialize
240
+ * nvtxSyncUserAttributes_t attribs = {0};
241
+ * attribs.version = NVTX_VERSION;
242
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
243
+ *
244
+ * // Configure the Attributes
245
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
246
+ * attribs.message.ascii = "Example";
247
+ * \endcode
248
+ *
249
+ * \sa
250
+ * ::nvtxDomainSyncUserCreate
251
+ */
252
+ typedef struct nvtxSyncUserAttributes_v0
253
+ {
254
+ /**
255
+ * \brief Version flag of the structure.
256
+ *
257
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
258
+ * supported in this header file. This can optionally be overridden to
259
+ * another version of the tools extension library.
260
+ */
261
+ uint16_t version;
262
+
263
+ /**
264
+ * \brief Size of the structure.
265
+ *
266
+ * Needs to be set to the size in bytes of the event attribute
267
+ * structure used to specify the event.
268
+ */
269
+ uint16_t size;
270
+
271
+ /** \brief Message type specified in this attribute structure.
272
+ *
273
+ * Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
274
+ * "message" field.
275
+ *
276
+ * Default Value is NVTX_MESSAGE_UNKNOWN
277
+ */
278
+ int32_t messageType; /* nvtxMessageType_t */
279
+
280
+ /** \brief Message assigned to this attribute structure.
281
+ *
282
+ * The text message that is attached to an event.
283
+ */
284
+ nvtxMessageValue_t message;
285
+
286
+ } nvtxSyncUserAttributes_v0;
287
+
288
+ typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
289
+
290
+ /* ------------------------------------------------------------------------- */
291
+ /** \brief Create a user defined synchronization object
292
+ * This is used to track non-OS synchronization working with spinlocks and atomics
293
+ *
294
+ * \param domain - Domain to own the resource
295
+ * \param attribs - A structure to assign multiple attributes to the object.
296
+ *
297
+ * \return A handle that represents the newly created user defined synchronization object.
298
+ *
299
+ * \sa
300
+ * ::nvtxDomainSyncUserCreate
301
+ * ::nvtxDomainSyncUserDestroy
302
+ * ::nvtxDomainSyncUserAcquireStart
303
+ * ::nvtxDomainSyncUserAcquireFailed
304
+ * ::nvtxDomainSyncUserAcquireSuccess
305
+ * ::nvtxDomainSyncUserReleasing
306
+ *
307
+ * \version \NVTX_VERSION_2
308
+ */
309
+ NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
310
+
311
+ /* ------------------------------------------------------------------------- */
312
+ /** \brief Destroy a user defined synchronization object
313
+ * This is used to track non-OS synchronization working with spinlocks and atomics
314
+ *
315
+ * \param handle - A handle to the object to operate on.
316
+ *
317
+ * \sa
318
+ * ::nvtxDomainSyncUserCreate
319
+ * ::nvtxDomainSyncUserDestroy
320
+ * ::nvtxDomainSyncUserAcquireStart
321
+ * ::nvtxDomainSyncUserAcquireFailed
322
+ * ::nvtxDomainSyncUserAcquireSuccess
323
+ * ::nvtxDomainSyncUserReleasing
324
+ *
325
+ * \version \NVTX_VERSION_2
326
+ */
327
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
328
+
329
+ /* ------------------------------------------------------------------------- */
330
+ /** \brief Signal to tools that an attempt to acquire a user defined synchronization object
331
+ *
332
+ * \param handle - A handle to the object to operate on.
333
+ *
334
+ * \sa
335
+ * ::nvtxDomainSyncUserCreate
336
+ * ::nvtxDomainSyncUserDestroy
337
+ * ::nvtxDomainSyncUserAcquireStart
338
+ * ::nvtxDomainSyncUserAcquireFailed
339
+ * ::nvtxDomainSyncUserAcquireSuccess
340
+ * ::nvtxDomainSyncUserReleasing
341
+ *
342
+ * \version \NVTX_VERSION_2
343
+ */
344
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
345
+
346
+ /* ------------------------------------------------------------------------- */
347
+ /** \brief Signal to tools of failure in acquiring a user defined synchronization object
348
+ * This should be called after \ref nvtxDomainSyncUserAcquireStart
349
+ *
350
+ * \param handle - A handle to the object to operate on.
351
+ *
352
+ * \sa
353
+ * ::nvtxDomainSyncUserCreate
354
+ * ::nvtxDomainSyncUserDestroy
355
+ * ::nvtxDomainSyncUserAcquireStart
356
+ * ::nvtxDomainSyncUserAcquireFailed
357
+ * ::nvtxDomainSyncUserAcquireSuccess
358
+ * ::nvtxDomainSyncUserReleasing
359
+ *
360
+ * \version \NVTX_VERSION_2
361
+ */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle);
362
+
363
+ /* ------------------------------------------------------------------------- */
364
+ /** \brief Signal to tools of success in acquiring a user defined synchronization object
365
+ * This should be called after \ref nvtxDomainSyncUserAcquireStart.
366
+ *
367
+ * \param handle - A handle to the object to operate on.
368
+ *
369
+ * \sa
370
+ * ::nvtxDomainSyncUserCreate
371
+ * ::nvtxDomainSyncUserDestroy
372
+ * ::nvtxDomainSyncUserAcquireStart
373
+ * ::nvtxDomainSyncUserAcquireFailed
374
+ * ::nvtxDomainSyncUserAcquireSuccess
375
+ * ::nvtxDomainSyncUserReleasing
376
+ *
377
+ * \version \NVTX_VERSION_2
378
+ */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle);
379
+
380
+ /* ------------------------------------------------------------------------- */
381
+ /** \brief Signal to tools of releasing a reservation on user defined synchronization object
382
+ * This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
383
+ *
384
+ * \param handle - A handle to the object to operate on.
385
+ *
386
+ * \sa
387
+ * ::nvtxDomainSyncUserCreate
388
+ * ::nvtxDomainSyncUserDestroy
389
+ * ::nvtxDomainSyncUserAcquireStart
390
+ * ::nvtxDomainSyncUserAcquireFailed
391
+ * ::nvtxDomainSyncUserAcquireSuccess
392
+ * ::nvtxDomainSyncUserReleasing
393
+ *
394
+ * \version \NVTX_VERSION_2
395
+ */
396
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
397
+
398
+
399
+ /** @} */ /*END defgroup*/
400
+
401
+ #ifdef __cplusplus
402
+ }
403
+ #endif /* __cplusplus */
404
+
405
+ #ifndef NVTX_NO_IMPL
406
+ #define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */
407
+ #include "nvtxDetail/nvtxImplSync_v3.h"
408
+ #undef NVTX_IMPL_GUARD_SYNC
409
+ #endif /*NVTX_NO_IMPL*/
410
+
411
+ #endif /* NVTOOLSEXT_SYNC_V3 */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* This file was procedurally generated! Do not modify this file by hand. */
2
+
3
+ /*
4
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
5
+ *
6
+ * NOTICE TO USER:
7
+ *
8
+ * This source code is subject to NVIDIA ownership rights under U.S. and
9
+ * international Copyright laws.
10
+ *
11
+ * This software and the information contained herein is PROPRIETARY and
12
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
13
+ * of a form of NVIDIA software license agreement.
14
+ *
15
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
16
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
17
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
18
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
19
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
20
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
21
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
22
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
23
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
24
+ * OR PERFORMANCE OF THIS SOURCE CODE.
25
+ *
26
+ * U.S. Government End Users. This source code is a "commercial item" as
27
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
28
+ * "commercial computer software" and "commercial computer software
29
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
30
+ * and is provided to the U.S. Government only as a commercial end item.
31
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
32
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
33
+ * source code with only those rights set forth herein.
34
+ *
35
+ * Any use of this source code in individual and commercial software must
36
+ * include, in the user documentation and internal comments to the code,
37
+ * the above Disclaimer and U.S. Government End Users Notice.
38
+ */
39
+
40
+ #ifndef NVTX_IMPL_GUARD
41
+ #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
42
+ #endif
43
+
44
+ /* ---- Include required platform headers ---- */
45
+
46
+ #if defined(_WIN32)
47
+
48
+ #include <Windows.h>
49
+
50
+ #else
51
+ #include <unistd.h>
52
+
53
+ #if defined(__ANDROID__)
54
+ #include <android/api-level.h>
55
+ #endif
56
+
57
+ #if defined(__linux__) || defined(__CYGWIN__)
58
+ #include <sched.h>
59
+ #endif
60
+
61
+ #include <limits.h>
62
+ #include <dlfcn.h>
63
+ #include <fcntl.h>
64
+ #include <stdlib.h>
65
+ #include <stdio.h>
66
+ #include <sys/types.h>
67
+ #include <unistd.h>
68
+ #include <errno.h>
69
+
70
+ #include <string.h>
71
+ #include <sys/types.h>
72
+ #include <pthread.h>
73
+ #include <stdlib.h>
74
+ #include <wchar.h>
75
+
76
+ #endif
77
+
78
+ /* ---- Define macros used in this file ---- */
79
+
80
+ #define NVTX_INIT_STATE_FRESH 0
81
+ #define NVTX_INIT_STATE_STARTED 1
82
+ #define NVTX_INIT_STATE_COMPLETE 2
83
+
84
+ #ifdef NVTX_DEBUG_PRINT
85
+ #ifdef __ANDROID__
86
+ #include <android/log.h>
87
+ #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
88
+ #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
89
+ #else
90
+ #include <stdio.h>
91
+ #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
92
+ #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
93
+ #endif
94
+ #else /* !defined(NVTX_DEBUG_PRINT) */
95
+ #define NVTX_ERR(...)
96
+ #define NVTX_INFO(...)
97
+ #endif
98
+
99
+ #ifdef __cplusplus
100
+ extern "C" {
101
+ #endif /* __cplusplus */
102
+
103
+ #ifdef __GNUC__
104
+ #pragma GCC visibility push(hidden)
105
+ #endif
106
+
107
+ /* ---- Forward declare all functions referenced in globals ---- */
108
+
109
+ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
110
+ NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
111
+ NvtxCallbackModule module,
112
+ NvtxFunctionTable* out_table,
113
+ unsigned int* out_size);
114
+ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
115
+ uint32_t version);
116
+ NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
117
+ uint32_t exportTableId);
118
+
119
+ #include "nvtxInitDecls.h"
120
+
121
+ /* ---- Define all globals ---- */
122
+
123
+ typedef struct nvtxGlobals_t
124
+ {
125
+ volatile unsigned int initState;
126
+ NvtxExportTableCallbacks etblCallbacks;
127
+ NvtxExportTableVersionInfo etblVersionInfo;
128
+
129
+ /* Implementation function pointers */
130
+ nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
131
+ nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
132
+ nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
133
+ nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
134
+ nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
135
+ nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
136
+ nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
137
+ nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
138
+ nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
139
+ nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
140
+ nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
141
+ nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
142
+ nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
143
+ nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
144
+ nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
145
+
146
+ nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
147
+ nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
148
+ nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
149
+ nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
150
+ nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
151
+ nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
152
+ nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
153
+ nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
154
+
155
+ nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
156
+ nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
157
+ nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
158
+ nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
159
+ nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
160
+ nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
161
+ nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
162
+ nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
163
+ nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
164
+ nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
165
+ nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
166
+ nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
167
+ nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
168
+ nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
169
+
170
+ nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
171
+ nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
172
+ nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
173
+ nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
174
+ nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
175
+ nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
176
+
177
+ nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
178
+ nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
179
+ nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
180
+ nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
181
+ nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
182
+ nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
183
+ nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
184
+ nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
185
+ nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
186
+ nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
187
+ nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
188
+ nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
189
+ nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
190
+ nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
191
+ nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
192
+
193
+ nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
194
+ nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
195
+ nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
196
+ nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
197
+ nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
198
+ nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
199
+
200
+ /* Tables of function pointers -- Extra null added to the end to ensure
201
+ * a crash instead of silent corruption if a tool reads off the end. */
202
+ NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1];
203
+ NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1];
204
+ NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
205
+ NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
206
+ NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1];
207
+ NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1];
208
+ } nvtxGlobals_t;
209
+
210
+ NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
211
+ {
212
+ NVTX_INIT_STATE_FRESH,
213
+
214
+ {
215
+ sizeof(NvtxExportTableCallbacks),
216
+ NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
217
+ },
218
+ {
219
+ sizeof(NvtxExportTableVersionInfo),
220
+ NVTX_VERSION,
221
+ 0,
222
+ NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
223
+ },
224
+
225
+ /* Implementation function pointers */
226
+ NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
227
+ NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
228
+ NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
229
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
230
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
231
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
232
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
233
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
234
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
235
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
236
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
237
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
238
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
239
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
240
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
241
+
242
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
243
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
244
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
245
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
246
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
247
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
248
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
249
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
250
+
251
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
252
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
253
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
254
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
255
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
256
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
257
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
258
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
259
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
260
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
261
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
262
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
263
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
264
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
265
+
266
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
267
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
268
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
269
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
270
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
271
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
272
+
273
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
274
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
275
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
276
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
277
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
278
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
279
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
280
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
281
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
282
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
283
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
284
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
285
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
286
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
287
+ NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
288
+
289
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
290
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
291
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
292
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
293
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
294
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
295
+
296
+ /* Tables of function pointers */
297
+ {
298
+ 0,
299
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
300
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
301
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
302
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
303
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
304
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
305
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
306
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
307
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
308
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
309
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
310
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
311
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
312
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
313
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
314
+ 0
315
+ },
316
+ {
317
+ 0,
318
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
319
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
320
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
321
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
322
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
323
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
324
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
325
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
326
+ 0
327
+ },
328
+ {
329
+ 0,
330
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
331
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
332
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
333
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
334
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
335
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
336
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
337
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
338
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
339
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
340
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
341
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
342
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
343
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
344
+ 0
345
+ },
346
+ {
347
+ 0,
348
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
349
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
350
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
351
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
352
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
353
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
354
+ 0
355
+ },
356
+ {
357
+ 0,
358
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
359
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
360
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
361
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
362
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
363
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
364
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
365
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
366
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
367
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
368
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
369
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
370
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
371
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
372
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
373
+ 0
374
+ },
375
+ {
376
+ 0,
377
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
378
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
379
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
380
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
381
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
382
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
383
+ 0
384
+ }
385
+ };
386
+
387
+ /* ---- Define static inline implementations of core API functions ---- */
388
+
389
+ #include "nvtxImplCore.h"
390
+
391
+ /* ---- Define implementations of export table functions ---- */
392
+
393
+ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
394
+ NvtxCallbackModule module,
395
+ NvtxFunctionTable* out_table,
396
+ unsigned int* out_size)
397
+ {
398
+ unsigned int bytes = 0;
399
+ NvtxFunctionTable table = (NvtxFunctionTable)0;
400
+
401
+ switch (module)
402
+ {
403
+ case NVTX_CB_MODULE_CORE:
404
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
405
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
406
+ break;
407
+ case NVTX_CB_MODULE_CUDA:
408
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
409
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
410
+ break;
411
+ case NVTX_CB_MODULE_OPENCL:
412
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
413
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
414
+ break;
415
+ case NVTX_CB_MODULE_CUDART:
416
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
417
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
418
+ break;
419
+ case NVTX_CB_MODULE_CORE2:
420
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
421
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
422
+ break;
423
+ case NVTX_CB_MODULE_SYNC:
424
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
425
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
426
+ break;
427
+ default: return 0;
428
+ }
429
+
430
+ if (out_size)
431
+ *out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1;
432
+
433
+ if (out_table)
434
+ *out_table = table;
435
+
436
+ return 1;
437
+ }
438
+
439
+ NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
440
+ {
441
+ switch (exportTableId)
442
+ {
443
+ case NVTX_ETID_CALLBACKS: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
444
+ case NVTX_ETID_VERSIONINFO: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
445
+ default: return 0;
446
+ }
447
+ }
448
+
449
+ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
450
+ {
451
+ /* Reserved for custom implementations to resolve problems with tools */
452
+ (void)version;
453
+ }
454
+
455
+ /* ---- Define implementations of init versions of all API functions ---- */
456
+
457
+ #include "nvtxInitDefs.h"
458
+
459
+ /* ---- Define implementations of initialization functions ---- */
460
+
461
+ #include "nvtxInit.h"
462
+
463
+ #ifdef __GNUC__
464
+ #pragma GCC visibility pop
465
+ #endif
466
+
467
+ #ifdef __cplusplus
468
+ } /* extern "C" */
469
+ #endif /* __cplusplus */