koichi12 commited on
Commit
0efc066
·
verified ·
1 Parent(s): bcc798f

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +3 -0
  2. .venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc +3 -0
  3. .venv/lib/python3.11/site-packages/nvidia/__init__.py +0 -0
  4. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__init__.py +0 -0
  5. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h +78 -0
  6. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h +282 -0
  7. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_defines.h +65 -0
  8. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/mma.h +60 -0
  9. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp +92 -0
  10. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h +106 -0
  11. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_types.h +180 -0
  12. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp +316 -0
  13. .venv/lib/python3.11/site-packages/nvidia/curand/__init__.py +0 -0
  14. .venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/nvidia/curand/include/__init__.py +0 -0
  16. .venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand.h +1077 -0
  18. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h +87 -0
  19. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h +253 -0
  20. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_globals.h +93 -0
  21. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_kernel.h +1677 -0
  22. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_lognormal.h +697 -0
  23. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mrg32k3a.h +0 -0
  24. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h +210 -0
  25. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_host.h +516 -0
  26. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h +386 -0
  27. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32dc_p_11213.h +0 -0
  28. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h +840 -0
  29. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal_static.h +134 -0
  30. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_philox4x32_x.h +195 -0
  31. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_poisson.h +763 -0
  32. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_precalc.h +0 -0
  33. .venv/lib/python3.11/site-packages/nvidia/curand/include/curand_uniform.h +498 -0
  34. .venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py +0 -0
  35. .venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py +0 -0
  37. .venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc +0 -0
  38. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py +0 -0
  39. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/__pycache__/__init__.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExt.h +1561 -0
  41. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCuda.h +164 -0
  42. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCudaRt.h +140 -0
  43. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtOpenCL.h +214 -0
  44. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtSync.h +406 -0
  45. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExt.h +1499 -0
  46. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtCuda.h +170 -0
  47. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtCudaRt.h +146 -0
  48. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtOpenCL.h +220 -0
  49. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtSync.h +411 -0
  50. .venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h +469 -0
.gitattributes CHANGED
@@ -116,3 +116,6 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
116
  .venv/lib/python3.11/site-packages/torchvision.libs/libz.5f199d92.so.1 filter=lfs diff=lfs merge=lfs -text
117
  .venv/lib/python3.11/site-packages/torchvision.libs/libjpeg.ceea7512.so.62 filter=lfs diff=lfs merge=lfs -text
118
  .venv/lib/python3.11/site-packages/attr/__pycache__/_make.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
 
 
116
  .venv/lib/python3.11/site-packages/torchvision.libs/libz.5f199d92.so.1 filter=lfs diff=lfs merge=lfs -text
117
  .venv/lib/python3.11/site-packages/torchvision.libs/libjpeg.ceea7512.so.62 filter=lfs diff=lfs merge=lfs -text
118
  .venv/lib/python3.11/site-packages/attr/__pycache__/_make.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
119
+ .venv/lib/python3.11/site-packages/opencv_python_headless.libs/libgfortran-91cc3cb1.so.3.0.0 filter=lfs diff=lfs merge=lfs -text
120
+ .venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
121
+ .venv/lib/python3.11/site-packages/pyasn1/type/__pycache__/univ.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:484629aab5363f8454a60a903003e4f5a00aa9ce88c11751116cfec8fcae7c8b
3
+ size 142553
.venv/lib/python3.11/site-packages/nvidia/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaProfilerTypedefs.h ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2021 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAPROFILERTYPEDEFS_H
51
+ #define CUDAPROFILERTYPEDEFS_H
52
+
53
+ #include <cudaProfiler.h>
54
+
55
+ #ifdef __cplusplus
56
+ extern "C" {
57
+ #endif // __cplusplus
58
+
59
+ /*
60
+ * Macros for the latest version for each driver function in cudaProfiler.h
61
+ */
62
+ #define PFN_cuProfilerInitialize PFN_cuProfilerInitialize_v4000
63
+ #define PFN_cuProfilerStart PFN_cuProfilerStart_v4000
64
+ #define PFN_cuProfilerStop PFN_cuProfilerStop_v4000
65
+
66
+
67
+ /**
68
+ * Type definitions for functions defined in cudaProfiler.h
69
+ */
70
+ typedef CUresult (CUDAAPI *PFN_cuProfilerInitialize_v4000)(const char *configFile, const char *outputFile, CUoutput_mode outputMode);
71
+ typedef CUresult (CUDAAPI *PFN_cuProfilerStart_v4000)(void);
72
+ typedef CUresult (CUDAAPI *PFN_cuProfilerStop_v4000)(void);
73
+
74
+ #ifdef __cplusplus
75
+ }
76
+ #endif // __cplusplus
77
+
78
+ #endif // file guard
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cudaVDPAU.h ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CUDAVDPAU_H
51
+ #define CUDAVDPAU_H
52
+
53
+ #ifdef CUDA_FORCE_API_VERSION
54
+ #error "CUDA_FORCE_API_VERSION is no longer supported."
55
+ #endif
56
+
57
+ #define cuVDPAUCtxCreate cuVDPAUCtxCreate_v2
58
+
59
+ #ifdef __cplusplus
60
+ extern "C" {
61
+ #endif
62
+
63
+ /**
64
+ * \defgroup CUDA_VDPAU VDPAU Interoperability
65
+ * \ingroup CUDA_DRIVER
66
+ *
67
+ * ___MANBRIEF___ VDPAU interoperability functions of the low-level CUDA driver
68
+ * API (___CURRENT_FILE___) ___ENDMANBRIEF___
69
+ *
70
+ * This section describes the VDPAU interoperability functions of the
71
+ * low-level CUDA driver application programming interface.
72
+ *
73
+ * @{
74
+ */
75
+
76
+ /**
77
+ * \brief Gets the CUDA device associated with a VDPAU device
78
+ *
79
+ * Returns in \p *pDevice the CUDA device associated with a \p vdpDevice, if
80
+ * applicable.
81
+ *
82
+ * \param pDevice - Device associated with vdpDevice
83
+ * \param vdpDevice - A VdpDevice handle
84
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
85
+ *
86
+ * \return
87
+ * ::CUDA_SUCCESS,
88
+ * ::CUDA_ERROR_DEINITIALIZED,
89
+ * ::CUDA_ERROR_NOT_INITIALIZED,
90
+ * ::CUDA_ERROR_INVALID_CONTEXT,
91
+ * ::CUDA_ERROR_INVALID_VALUE
92
+ * \notefnerr
93
+ *
94
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
95
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
96
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
97
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
98
+ * ::cudaVDPAUGetDevice
99
+ */
100
+ CUresult CUDAAPI cuVDPAUGetDevice(CUdevice *pDevice, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
101
+
102
+ /**
103
+ * \brief Create a CUDA context for interoperability with VDPAU
104
+ *
105
+ * Creates a new CUDA context, initializes VDPAU interoperability, and
106
+ * associates the CUDA context with the calling thread. It must be called
107
+ * before performing any other VDPAU interoperability operations. It may fail
108
+ * if the needed VDPAU driver facilities are not available. For usage of the
109
+ * \p flags parameter, see ::cuCtxCreate().
110
+ *
111
+ * \param pCtx - Returned CUDA context
112
+ * \param flags - Options for CUDA context creation
113
+ * \param device - Device on which to create the context
114
+ * \param vdpDevice - The VdpDevice to interop with
115
+ * \param vdpGetProcAddress - VDPAU's VdpGetProcAddress function pointer
116
+ *
117
+ * \return
118
+ * ::CUDA_SUCCESS,
119
+ * ::CUDA_ERROR_DEINITIALIZED,
120
+ * ::CUDA_ERROR_NOT_INITIALIZED,
121
+ * ::CUDA_ERROR_INVALID_CONTEXT,
122
+ * ::CUDA_ERROR_INVALID_VALUE,
123
+ * ::CUDA_ERROR_OUT_OF_MEMORY
124
+ * \notefnerr
125
+ *
126
+ * \sa ::cuCtxCreate, ::cuGraphicsVDPAURegisterVideoSurface,
127
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
128
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
129
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
130
+ * ::cuVDPAUGetDevice
131
+ */
132
+ CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
133
+
134
+ /**
135
+ * \brief Registers a VDPAU VdpVideoSurface object
136
+ *
137
+ * Registers the VdpVideoSurface specified by \p vdpSurface for access by
138
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
139
+ * The surface's intended usage is specified using \p flags, as follows:
140
+ *
141
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
142
+ * resource will be used. It is therefore assumed that this resource will be
143
+ * read from and written to by CUDA. This is the default value.
144
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
145
+ * will not write to this resource.
146
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
147
+ * CUDA will not read from this resource and will write over the
148
+ * entire contents of the resource, so none of the data previously
149
+ * stored in the resource will be preserved.
150
+ *
151
+ * The VdpVideoSurface is presented as an array of subresources that may be
152
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
153
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
154
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
155
+ *
156
+ * \htmlonly
157
+ * <table>
158
+ * <tr><th>VdpChromaType </th><th>arrayIndex</th><th>Size </th><th>Format</th><th>Content </th></tr>
159
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_420</td><td>0 </td><td>w x h/2</td><td>R8 </td><td>Top-field luma </td></tr>
160
+ * <tr> <td>1 </td><td>w x h/2</td><td>R8 </td><td>Bottom-field luma </td></tr>
161
+ * <tr> <td>2 </td><td>w/2 x h/4</td><td>R8G8 </td><td>Top-field chroma </td></tr>
162
+ * <tr> <td>3 </td><td>w/2 x h/4</td><td>R8G8 </td><td>Bottom-field chroma</td></tr>
163
+ * <tr><td rowspan="4" valign="top">VDP_CHROMA_TYPE_422</td><td>0 </td><td>w x h/2</td><td>R8 </td><td>Top-field luma </td></tr>
164
+ * <tr> <td>1 </td><td>w x h/2</td><td>R8 </td><td>Bottom-field luma </td></tr>
165
+ * <tr> <td>2 </td><td>w/2 x h/2</td><td>R8G8 </td><td>Top-field chroma </td></tr>
166
+ * <tr> <td>3 </td><td>w/2 x h/2</td><td>R8G8 </td><td>Bottom-field chroma</td></tr>
167
+ * </table>
168
+ * \endhtmlonly
169
+ *
170
+ * \latexonly
171
+ * \begin{tabular}{|l|l|l|l|l|}
172
+ * \hline
173
+ * VdpChromaType & arrayIndex & Size & Format & Content \\
174
+ * \hline
175
+ * VDP\_CHROMA\_TYPE\_420 & 0 & w x h/2 & R8 & Top-field luma \\
176
+ * & 1 & w x h/2 & R8 & Bottom-field luma \\
177
+ * & 2 & w/2 x h/4 & R8G8 & Top-field chroma \\
178
+ * & 3 & w/2 x h/4 & R8G8 & Bottom-field chroma \\
179
+ * \hline
180
+ * VDP\_CHROMA\_TYPE\_422 & 0 & w x h/2 & R8 & Top-field luma \\
181
+ * & 1 & w x h/2 & R8 & Bottom-field luma \\
182
+ * & 2 & w/2 x h/2 & R8G8 & Top-field chroma \\
183
+ * & 3 & w/2 x h/2 & R8G8 & Bottom-field chroma \\
184
+ * \hline
185
+ * \end{tabular}
186
+ * \endlatexonly
187
+ *
188
+ * \param pCudaResource - Pointer to the returned object handle
189
+ * \param vdpSurface - The VdpVideoSurface to be registered
190
+ * \param flags - Map flags
191
+ *
192
+ * \return
193
+ * ::CUDA_SUCCESS,
194
+ * ::CUDA_ERROR_INVALID_HANDLE,
195
+ * ::CUDA_ERROR_ALREADY_MAPPED,
196
+ * ::CUDA_ERROR_INVALID_CONTEXT,
197
+ * \notefnerr
198
+ *
199
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
200
+ * ::cuGraphicsVDPAURegisterOutputSurface, ::cuGraphicsUnregisterResource,
201
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
202
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
203
+ * ::cuVDPAUGetDevice,
204
+ * ::cudaGraphicsVDPAURegisterVideoSurface
205
+ */
206
+ CUresult CUDAAPI cuGraphicsVDPAURegisterVideoSurface(CUgraphicsResource *pCudaResource, VdpVideoSurface vdpSurface, unsigned int flags);
207
+
208
+ /**
209
+ * \brief Registers a VDPAU VdpOutputSurface object
210
+ *
211
+ * Registers the VdpOutputSurface specified by \p vdpSurface for access by
212
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
213
+ * The surface's intended usage is specified using \p flags, as follows:
214
+ *
215
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_NONE: Specifies no hints about how this
216
+ * resource will be used. It is therefore assumed that this resource will be
217
+ * read from and written to by CUDA. This is the default value.
218
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_READ_ONLY: Specifies that CUDA
219
+ * will not write to this resource.
220
+ * - ::CU_GRAPHICS_MAP_RESOURCE_FLAGS_WRITE_DISCARD: Specifies that
221
+ * CUDA will not read from this resource and will write over the
222
+ * entire contents of the resource, so none of the data previously
223
+ * stored in the resource will be preserved.
224
+ *
225
+ * The VdpOutputSurface is presented as an array of subresources that may be
226
+ * accessed using pointers returned by ::cuGraphicsSubResourceGetMappedArray.
227
+ * The exact number of valid \p arrayIndex values depends on the VDPAU surface
228
+ * format. The mapping is shown in the table below. \p mipLevel must be 0.
229
+ *
230
+ * \htmlonly
231
+ * <table>
232
+ * <tr><th>VdpRGBAFormat </th><th>arrayIndex</th><th>Size </th><th>Format </th><th>Content </th></tr>
233
+ * <tr><td>VDP_RGBA_FORMAT_B8G8R8A8 </td><td>0 </td><td>w x h</td><td>ARGB8 </td><td>Entire surface</td></tr>
234
+ * <tr><td>VDP_RGBA_FORMAT_R10G10B10A2</td><td>0 </td><td>w x h</td><td>A2BGR10</td><td>Entire surface</td></tr>
235
+ * </table>
236
+ * \endhtmlonly
237
+ *
238
+ * \latexonly
239
+ * \begin{tabular}{|l|l|l|l|l|}
240
+ * \hline
241
+ * VdpRGBAFormat & arrayIndex & Size & Format & Content \\
242
+ * \hline
243
+ * VDP\_RGBA\_FORMAT\_B8G8R8A8 & 0 & w x h & ARGB8 & Entire surface \\
244
+ * VDP\_RGBA\_FORMAT\_R10G10B10A2 & 0 & w x h & A2BGR10 & Entire surface \\
245
+ * \hline
246
+ * \end{tabular}
247
+ * \endlatexonly
248
+ *
249
+ * \param pCudaResource - Pointer to the returned object handle
250
+ * \param vdpSurface - The VdpOutputSurface to be registered
251
+ * \param flags - Map flags
252
+ *
253
+ * \return
254
+ * ::CUDA_SUCCESS,
255
+ * ::CUDA_ERROR_INVALID_HANDLE,
256
+ * ::CUDA_ERROR_ALREADY_MAPPED,
257
+ * ::CUDA_ERROR_INVALID_CONTEXT,
258
+ * \notefnerr
259
+ *
260
+ * \sa ::cuCtxCreate, ::cuVDPAUCtxCreate,
261
+ * ::cuGraphicsVDPAURegisterVideoSurface, ::cuGraphicsUnregisterResource,
262
+ * ::cuGraphicsResourceSetMapFlags, ::cuGraphicsMapResources,
263
+ * ::cuGraphicsUnmapResources, ::cuGraphicsSubResourceGetMappedArray,
264
+ * ::cuVDPAUGetDevice,
265
+ * ::cudaGraphicsVDPAURegisterOutputSurface
266
+ */
267
+ CUresult CUDAAPI cuGraphicsVDPAURegisterOutputSurface(CUgraphicsResource *pCudaResource, VdpOutputSurface vdpSurface, unsigned int flags);
268
+
269
+ /** @} */ /* END CUDA_VDPAU */
270
+
271
+
272
+ #if defined(__CUDA_API_VERSION_INTERNAL)
273
+ #undef cuVDPAUCtxCreate
274
+
275
+ CUresult CUDAAPI cuVDPAUCtxCreate(CUcontext *pCtx, unsigned int flags, CUdevice device, VdpDevice vdpDevice, VdpGetProcAddress *vdpGetProcAddress);
276
+ #endif /* __CUDA_API_VERSION_INTERNAL */
277
+
278
+ #ifdef __cplusplus
279
+ };
280
+ #endif
281
+
282
+ #endif
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_defines.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "host_defines.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/host_defines.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_DEFINES_H_WRAPPER__
65
+ #endif
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/mma.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
52
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
53
+ #endif
54
+
55
+ #include "crt/mma.h"
56
+
57
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__)
58
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
59
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_MMA_H_WRAPPER__
60
+ #endif
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_20_atomic_functions.hpp ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_20_ATOMIC_FUNCTIONS_HPP__)
51
+ #define __SM_20_ATOMIC_FUNCTIONS_HPP__
52
+
53
+ #if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA)
54
+ extern "C"
55
+ {
56
+ extern __device__ __device_builtin__ float __fAtomicAdd(float *address, float val);
57
+ }
58
+ #endif /* __CUDA_ARCH__ */
59
+
60
+ #if defined(__CUDACC_RTC__)
61
+ #define __SM_20_ATOMIC_FUNCTIONS_DECL__ __device__
62
+ #else /* __CUDACC_RTC__ */
63
+ #define __SM_20_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
64
+ #endif /* __CUDACC_RTC__ */
65
+
66
+ #if defined(__cplusplus) && defined(__CUDACC__)
67
+
68
+ /*******************************************************************************
69
+ * *
70
+ * *
71
+ * *
72
+ *******************************************************************************/
73
+
74
+ #include "cuda_runtime_api.h"
75
+
76
+ /*******************************************************************************
77
+ * *
78
+ * *
79
+ * *
80
+ *******************************************************************************/
81
+
82
+ __SM_20_ATOMIC_FUNCTIONS_DECL__ float atomicAdd(float *address, float val)
83
+ {
84
+ return __fAtomicAdd(address, val);
85
+ }
86
+
87
+ #endif /* __cplusplus && __CUDACC__ */
88
+
89
+ #undef __SM_20_ATOMIC_FUNCTIONS_DECL__
90
+
91
+ #endif /* !__SM_20_ATOMIC_FUNCTIONS_HPP__ */
92
+
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_35_intrinsics.h ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+
3
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
4
+
5
+ *
6
+
7
+ * NOTICE TO LICENSEE:
8
+
9
+ *
10
+
11
+ * This source code and/or documentation ("Licensed Deliverables") are
12
+
13
+ * subject to NVIDIA intellectual property rights under U.S. and
14
+
15
+ * international Copyright laws.
16
+
17
+ *
18
+
19
+ * These Licensed Deliverables contained herein is PROPRIETARY and
20
+
21
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
22
+
23
+ * conditions of a form of NVIDIA software license agreement by and
24
+
25
+ * between NVIDIA and Licensee ("License Agreement") or electronically
26
+
27
+ * accepted by Licensee. Notwithstanding any terms or conditions to
28
+
29
+ * the contrary in the License Agreement, reproduction or disclosure
30
+
31
+ * of the Licensed Deliverables to any third party without the express
32
+
33
+ * written consent of NVIDIA is prohibited.
34
+
35
+ *
36
+
37
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
38
+
39
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
40
+
41
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
42
+
43
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
44
+
45
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
46
+
47
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
48
+
49
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
50
+
51
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
52
+
53
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
54
+
55
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
56
+
57
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
58
+
59
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
60
+
61
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
62
+
63
+ * OF THESE LICENSED DELIVERABLES.
64
+
65
+ *
66
+
67
+ * U.S. Government End Users. These Licensed Deliverables are a
68
+
69
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
70
+
71
+ * 1995), consisting of "commercial computer software" and "commercial
72
+
73
+ * computer software documentation" as such terms are used in 48
74
+
75
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
76
+
77
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
78
+
79
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
80
+
81
+ * U.S. Government End Users acquire the Licensed Deliverables with
82
+
83
+ * only those rights set forth herein.
84
+
85
+ *
86
+
87
+ * Any use of the Licensed Deliverables in individual and commercial
88
+
89
+ * software must include, in the user documentation and internal
90
+
91
+ * comments to the code, the above Disclaimer and U.S. Government End
92
+
93
+ * Users Notice.
94
+
95
+ */
96
+
97
+
98
+
99
+ #if !defined(__SM_35_INTRINSICS_H__)
100
+ #define __SM_35_INTRINSICS_H__
101
+
102
+
103
+
104
+
105
+ #endif /* !__SM_35_INTRINSICS_H__ */
106
+
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_types.h ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__TEXTURE_TYPES_H__)
51
+ #define __TEXTURE_TYPES_H__
52
+
53
+ /*******************************************************************************
54
+ * *
55
+ * *
56
+ * *
57
+ *******************************************************************************/
58
+
59
+ #include "driver_types.h"
60
+
61
+ #ifndef __CUDACC_RTC_MINIMAL__
62
+
63
+ /**
64
+ * \addtogroup CUDART_TYPES
65
+ *
66
+ * @{
67
+ */
68
+
69
+ /*******************************************************************************
70
+ * *
71
+ * *
72
+ * *
73
+ *******************************************************************************/
74
+
75
+ #define cudaTextureType1D 0x01
76
+ #define cudaTextureType2D 0x02
77
+ #define cudaTextureType3D 0x03
78
+ #define cudaTextureTypeCubemap 0x0C
79
+ #define cudaTextureType1DLayered 0xF1
80
+ #define cudaTextureType2DLayered 0xF2
81
+ #define cudaTextureTypeCubemapLayered 0xFC
82
+
83
+ /**
84
+ * CUDA texture address modes
85
+ */
86
+ enum __device_builtin__ cudaTextureAddressMode
87
+ {
88
+ cudaAddressModeWrap = 0, /**< Wrapping address mode */
89
+ cudaAddressModeClamp = 1, /**< Clamp to edge address mode */
90
+ cudaAddressModeMirror = 2, /**< Mirror address mode */
91
+ cudaAddressModeBorder = 3 /**< Border address mode */
92
+ };
93
+
94
+ /**
95
+ * CUDA texture filter modes
96
+ */
97
+ enum __device_builtin__ cudaTextureFilterMode
98
+ {
99
+ cudaFilterModePoint = 0, /**< Point filter mode */
100
+ cudaFilterModeLinear = 1 /**< Linear filter mode */
101
+ };
102
+
103
+ /**
104
+ * CUDA texture read modes
105
+ */
106
+ enum __device_builtin__ cudaTextureReadMode
107
+ {
108
+ cudaReadModeElementType = 0, /**< Read texture as specified element type */
109
+ cudaReadModeNormalizedFloat = 1 /**< Read texture as normalized float */
110
+ };
111
+
112
+ /**
113
+ * CUDA texture descriptor
114
+ */
115
+ struct __device_builtin__ cudaTextureDesc
116
+ {
117
+ /**
118
+ * Texture address mode for up to 3 dimensions
119
+ */
120
+ enum cudaTextureAddressMode addressMode[3];
121
+ /**
122
+ * Texture filter mode
123
+ */
124
+ enum cudaTextureFilterMode filterMode;
125
+ /**
126
+ * Texture read mode
127
+ */
128
+ enum cudaTextureReadMode readMode;
129
+ /**
130
+ * Perform sRGB->linear conversion during texture read
131
+ */
132
+ int sRGB;
133
+ /**
134
+ * Texture Border Color
135
+ */
136
+ float borderColor[4];
137
+ /**
138
+ * Indicates whether texture reads are normalized or not
139
+ */
140
+ int normalizedCoords;
141
+ /**
142
+ * Limit to the anisotropy ratio
143
+ */
144
+ unsigned int maxAnisotropy;
145
+ /**
146
+ * Mipmap filter mode
147
+ */
148
+ enum cudaTextureFilterMode mipmapFilterMode;
149
+ /**
150
+ * Offset applied to the supplied mipmap level
151
+ */
152
+ float mipmapLevelBias;
153
+ /**
154
+ * Lower end of the mipmap level range to clamp access to
155
+ */
156
+ float minMipmapLevelClamp;
157
+ /**
158
+ * Upper end of the mipmap level range to clamp access to
159
+ */
160
+ float maxMipmapLevelClamp;
161
+ /**
162
+ * Disable any trilinear filtering optimizations.
163
+ */
164
+ int disableTrilinearOptimization;
165
+ /**
166
+ * Enable seamless cube map filtering.
167
+ */
168
+ int seamlessCubemap;
169
+ };
170
+
171
+ /**
172
+ * An opaque value that represents a CUDA texture object
173
+ */
174
+ typedef __device_builtin__ unsigned long long cudaTextureObject_t;
175
+
176
+ /** @} */
177
+ /** @} */ /* END CUDART_TYPES */
178
+
179
+ #endif /* !__CUDACC_RTC_MINIMAL__ */
180
+ #endif /* !__TEXTURE_TYPES_H__ */
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/vector_functions.hpp ADDED
@@ -0,0 +1,316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__VECTOR_FUNCTIONS_HPP__)
51
+ #define __VECTOR_FUNCTIONS_HPP__
52
+
53
+ /*******************************************************************************
54
+ * *
55
+ * *
56
+ * *
57
+ *******************************************************************************/
58
+
59
+ #include "cuda_runtime_api.h"
60
+
61
+ #if defined(__CUDACC_RTC__)
62
+ #define __VECTOR_FUNCTIONS_DECL__ __host__ __device__
63
+ #else /* !__CUDACC_RTC__ */
64
+ #define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
65
+ #endif /* __CUDACC_RTC__ */
66
+
67
+ /*******************************************************************************
68
+ * *
69
+ * *
70
+ * *
71
+ *******************************************************************************/
72
+
73
+ __VECTOR_FUNCTIONS_DECL__ char1 make_char1(signed char x)
74
+ {
75
+ char1 t; t.x = x; return t;
76
+ }
77
+
78
+ __VECTOR_FUNCTIONS_DECL__ uchar1 make_uchar1(unsigned char x)
79
+ {
80
+ uchar1 t; t.x = x; return t;
81
+ }
82
+
83
+ __VECTOR_FUNCTIONS_DECL__ char2 make_char2(signed char x, signed char y)
84
+ {
85
+ char2 t; t.x = x; t.y = y; return t;
86
+ }
87
+
88
+ __VECTOR_FUNCTIONS_DECL__ uchar2 make_uchar2(unsigned char x, unsigned char y)
89
+ {
90
+ uchar2 t; t.x = x; t.y = y; return t;
91
+ }
92
+
93
+ __VECTOR_FUNCTIONS_DECL__ char3 make_char3(signed char x, signed char y, signed char z)
94
+ {
95
+ char3 t; t.x = x; t.y = y; t.z = z; return t;
96
+ }
97
+
98
+ __VECTOR_FUNCTIONS_DECL__ uchar3 make_uchar3(unsigned char x, unsigned char y, unsigned char z)
99
+ {
100
+ uchar3 t; t.x = x; t.y = y; t.z = z; return t;
101
+ }
102
+
103
+ __VECTOR_FUNCTIONS_DECL__ char4 make_char4(signed char x, signed char y, signed char z, signed char w)
104
+ {
105
+ char4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
106
+ }
107
+
108
+ __VECTOR_FUNCTIONS_DECL__ uchar4 make_uchar4(unsigned char x, unsigned char y, unsigned char z, unsigned char w)
109
+ {
110
+ uchar4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
111
+ }
112
+
113
+ __VECTOR_FUNCTIONS_DECL__ short1 make_short1(short x)
114
+ {
115
+ short1 t; t.x = x; return t;
116
+ }
117
+
118
+ __VECTOR_FUNCTIONS_DECL__ ushort1 make_ushort1(unsigned short x)
119
+ {
120
+ ushort1 t; t.x = x; return t;
121
+ }
122
+
123
+ __VECTOR_FUNCTIONS_DECL__ short2 make_short2(short x, short y)
124
+ {
125
+ short2 t; t.x = x; t.y = y; return t;
126
+ }
127
+
128
+ __VECTOR_FUNCTIONS_DECL__ ushort2 make_ushort2(unsigned short x, unsigned short y)
129
+ {
130
+ ushort2 t; t.x = x; t.y = y; return t;
131
+ }
132
+
133
+ __VECTOR_FUNCTIONS_DECL__ short3 make_short3(short x,short y, short z)
134
+ {
135
+ short3 t; t.x = x; t.y = y; t.z = z; return t;
136
+ }
137
+
138
+ __VECTOR_FUNCTIONS_DECL__ ushort3 make_ushort3(unsigned short x, unsigned short y, unsigned short z)
139
+ {
140
+ ushort3 t; t.x = x; t.y = y; t.z = z; return t;
141
+ }
142
+
143
+ __VECTOR_FUNCTIONS_DECL__ short4 make_short4(short x, short y, short z, short w)
144
+ {
145
+ short4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
146
+ }
147
+
148
+ __VECTOR_FUNCTIONS_DECL__ ushort4 make_ushort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
149
+ {
150
+ ushort4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
151
+ }
152
+
153
+ __VECTOR_FUNCTIONS_DECL__ int1 make_int1(int x)
154
+ {
155
+ int1 t; t.x = x; return t;
156
+ }
157
+
158
+ __VECTOR_FUNCTIONS_DECL__ uint1 make_uint1(unsigned int x)
159
+ {
160
+ uint1 t; t.x = x; return t;
161
+ }
162
+
163
+ __VECTOR_FUNCTIONS_DECL__ int2 make_int2(int x, int y)
164
+ {
165
+ int2 t; t.x = x; t.y = y; return t;
166
+ }
167
+
168
+ __VECTOR_FUNCTIONS_DECL__ uint2 make_uint2(unsigned int x, unsigned int y)
169
+ {
170
+ uint2 t; t.x = x; t.y = y; return t;
171
+ }
172
+
173
+ __VECTOR_FUNCTIONS_DECL__ int3 make_int3(int x, int y, int z)
174
+ {
175
+ int3 t; t.x = x; t.y = y; t.z = z; return t;
176
+ }
177
+
178
+ __VECTOR_FUNCTIONS_DECL__ uint3 make_uint3(unsigned int x, unsigned int y, unsigned int z)
179
+ {
180
+ uint3 t; t.x = x; t.y = y; t.z = z; return t;
181
+ }
182
+
183
+ __VECTOR_FUNCTIONS_DECL__ int4 make_int4(int x, int y, int z, int w)
184
+ {
185
+ int4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
186
+ }
187
+
188
+ __VECTOR_FUNCTIONS_DECL__ uint4 make_uint4(unsigned int x, unsigned int y, unsigned int z, unsigned int w)
189
+ {
190
+ uint4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
191
+ }
192
+
193
+ __VECTOR_FUNCTIONS_DECL__ long1 make_long1(long int x)
194
+ {
195
+ long1 t; t.x = x; return t;
196
+ }
197
+
198
+ __VECTOR_FUNCTIONS_DECL__ ulong1 make_ulong1(unsigned long int x)
199
+ {
200
+ ulong1 t; t.x = x; return t;
201
+ }
202
+
203
+ __VECTOR_FUNCTIONS_DECL__ long2 make_long2(long int x, long int y)
204
+ {
205
+ long2 t; t.x = x; t.y = y; return t;
206
+ }
207
+
208
+ __VECTOR_FUNCTIONS_DECL__ ulong2 make_ulong2(unsigned long int x, unsigned long int y)
209
+ {
210
+ ulong2 t; t.x = x; t.y = y; return t;
211
+ }
212
+
213
+ __VECTOR_FUNCTIONS_DECL__ long3 make_long3(long int x, long int y, long int z)
214
+ {
215
+ long3 t; t.x = x; t.y = y; t.z = z; return t;
216
+ }
217
+
218
+ __VECTOR_FUNCTIONS_DECL__ ulong3 make_ulong3(unsigned long int x, unsigned long int y, unsigned long int z)
219
+ {
220
+ ulong3 t; t.x = x; t.y = y; t.z = z; return t;
221
+ }
222
+
223
+ __VECTOR_FUNCTIONS_DECL__ long4 make_long4(long int x, long int y, long int z, long int w)
224
+ {
225
+ long4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
226
+ }
227
+
228
+ __VECTOR_FUNCTIONS_DECL__ ulong4 make_ulong4(unsigned long int x, unsigned long int y, unsigned long int z, unsigned long int w)
229
+ {
230
+ ulong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
231
+ }
232
+
233
+ __VECTOR_FUNCTIONS_DECL__ float1 make_float1(float x)
234
+ {
235
+ float1 t; t.x = x; return t;
236
+ }
237
+
238
+ __VECTOR_FUNCTIONS_DECL__ float2 make_float2(float x, float y)
239
+ {
240
+ float2 t; t.x = x; t.y = y; return t;
241
+ }
242
+
243
+ __VECTOR_FUNCTIONS_DECL__ float3 make_float3(float x, float y, float z)
244
+ {
245
+ float3 t; t.x = x; t.y = y; t.z = z; return t;
246
+ }
247
+
248
+ __VECTOR_FUNCTIONS_DECL__ float4 make_float4(float x, float y, float z, float w)
249
+ {
250
+ float4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
251
+ }
252
+
253
+ __VECTOR_FUNCTIONS_DECL__ longlong1 make_longlong1(long long int x)
254
+ {
255
+ longlong1 t; t.x = x; return t;
256
+ }
257
+
258
+ __VECTOR_FUNCTIONS_DECL__ ulonglong1 make_ulonglong1(unsigned long long int x)
259
+ {
260
+ ulonglong1 t; t.x = x; return t;
261
+ }
262
+
263
+ __VECTOR_FUNCTIONS_DECL__ longlong2 make_longlong2(long long int x, long long int y)
264
+ {
265
+ longlong2 t; t.x = x; t.y = y; return t;
266
+ }
267
+
268
+ __VECTOR_FUNCTIONS_DECL__ ulonglong2 make_ulonglong2(unsigned long long int x, unsigned long long int y)
269
+ {
270
+ ulonglong2 t; t.x = x; t.y = y; return t;
271
+ }
272
+
273
+ __VECTOR_FUNCTIONS_DECL__ longlong3 make_longlong3(long long int x, long long int y, long long int z)
274
+ {
275
+ longlong3 t; t.x = x; t.y = y; t.z = z; return t;
276
+ }
277
+
278
+ __VECTOR_FUNCTIONS_DECL__ ulonglong3 make_ulonglong3(unsigned long long int x, unsigned long long int y, unsigned long long int z)
279
+ {
280
+ ulonglong3 t; t.x = x; t.y = y; t.z = z; return t;
281
+ }
282
+
283
+ __VECTOR_FUNCTIONS_DECL__ longlong4 make_longlong4(long long int x, long long int y, long long int z, long long int w)
284
+ {
285
+ longlong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
286
+ }
287
+
288
+ __VECTOR_FUNCTIONS_DECL__ ulonglong4 make_ulonglong4(unsigned long long int x, unsigned long long int y, unsigned long long int z, unsigned long long int w)
289
+ {
290
+ ulonglong4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
291
+ }
292
+
293
+ __VECTOR_FUNCTIONS_DECL__ double1 make_double1(double x)
294
+ {
295
+ double1 t; t.x = x; return t;
296
+ }
297
+
298
+ __VECTOR_FUNCTIONS_DECL__ double2 make_double2(double x, double y)
299
+ {
300
+ double2 t; t.x = x; t.y = y; return t;
301
+ }
302
+
303
+ __VECTOR_FUNCTIONS_DECL__ double3 make_double3(double x, double y, double z)
304
+ {
305
+ double3 t; t.x = x; t.y = y; t.z = z; return t;
306
+ }
307
+
308
+ __VECTOR_FUNCTIONS_DECL__ double4 make_double4(double x, double y, double z, double w)
309
+ {
310
+ double4 t; t.x = x; t.y = y; t.z = z; t.w = w; return t;
311
+ }
312
+
313
+ #undef __VECTOR_FUNCTIONS_DECL__
314
+
315
+ #endif /* !__VECTOR_FUNCTIONS_HPP__ */
316
+
.venv/lib/python3.11/site-packages/nvidia/curand/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/curand/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (186 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/curand/include/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/curand/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (194 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand.h ADDED
@@ -0,0 +1,1077 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(CURAND_H_)
51
+ #define CURAND_H_
52
+
53
+ /**
54
+ * \defgroup HOST Host API
55
+ *
56
+ * @{
57
+ */
58
+ #ifndef __CUDACC_RTC__
59
+ #include <cuda_runtime.h>
60
+ #endif
61
+
62
+ #ifndef CURANDAPI
63
+ #ifdef _WIN32
64
+ #define CURANDAPI __stdcall
65
+ #else
66
+ #define CURANDAPI
67
+ #endif
68
+ #endif
69
+
70
+ #if defined(__cplusplus)
71
+ extern "C" {
72
+ #endif /* __cplusplus */
73
+
74
+ #define CURAND_VER_MAJOR 10
75
+ #define CURAND_VER_MINOR 3
76
+ #define CURAND_VER_PATCH 5
77
+ #define CURAND_VER_BUILD 147
78
+ #define CURAND_VERSION (CURAND_VER_MAJOR * 1000 + \
79
+ CURAND_VER_MINOR * 100 + \
80
+ CURAND_VER_PATCH)
81
+ /* CURAND Host API datatypes */
82
+
83
+ /**
84
+ * @{
85
+ */
86
+
87
+ /**
88
+ * CURAND function call status types
89
+ */
90
+ enum curandStatus {
91
+ CURAND_STATUS_SUCCESS = 0, ///< No errors
92
+ CURAND_STATUS_VERSION_MISMATCH = 100, ///< Header file and linked library version do not match
93
+ CURAND_STATUS_NOT_INITIALIZED = 101, ///< Generator not initialized
94
+ CURAND_STATUS_ALLOCATION_FAILED = 102, ///< Memory allocation failed
95
+ CURAND_STATUS_TYPE_ERROR = 103, ///< Generator is wrong type
96
+ CURAND_STATUS_OUT_OF_RANGE = 104, ///< Argument out of range
97
+ CURAND_STATUS_LENGTH_NOT_MULTIPLE = 105, ///< Length requested is not a multiple of dimension
98
+ CURAND_STATUS_DOUBLE_PRECISION_REQUIRED = 106, ///< GPU does not have double precision required by MRG32k3a
99
+ CURAND_STATUS_LAUNCH_FAILURE = 201, ///< Kernel launch failure
100
+ CURAND_STATUS_PREEXISTING_FAILURE = 202, ///< Preexisting failure on library entry
101
+ CURAND_STATUS_INITIALIZATION_FAILED = 203, ///< Initialization of CUDA failed
102
+ CURAND_STATUS_ARCH_MISMATCH = 204, ///< Architecture mismatch, GPU does not support requested feature
103
+ CURAND_STATUS_INTERNAL_ERROR = 999 ///< Internal library error
104
+ };
105
+
106
+ /*
107
+ * CURAND function call status types
108
+ */
109
+ /** \cond UNHIDE_TYPEDEFS */
110
+ typedef enum curandStatus curandStatus_t;
111
+ /** \endcond */
112
+
113
+ /**
114
+ * CURAND generator types
115
+ */
116
+ enum curandRngType {
117
+ CURAND_RNG_TEST = 0,
118
+ CURAND_RNG_PSEUDO_DEFAULT = 100, ///< Default pseudorandom generator
119
+ CURAND_RNG_PSEUDO_XORWOW = 101, ///< XORWOW pseudorandom generator
120
+ CURAND_RNG_PSEUDO_MRG32K3A = 121, ///< MRG32k3a pseudorandom generator
121
+ CURAND_RNG_PSEUDO_MTGP32 = 141, ///< Mersenne Twister MTGP32 pseudorandom generator
122
+ CURAND_RNG_PSEUDO_MT19937 = 142, ///< Mersenne Twister MT19937 pseudorandom generator
123
+ CURAND_RNG_PSEUDO_PHILOX4_32_10 = 161, ///< PHILOX-4x32-10 pseudorandom generator
124
+ CURAND_RNG_QUASI_DEFAULT = 200, ///< Default quasirandom generator
125
+ CURAND_RNG_QUASI_SOBOL32 = 201, ///< Sobol32 quasirandom generator
126
+ CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 = 202, ///< Scrambled Sobol32 quasirandom generator
127
+ CURAND_RNG_QUASI_SOBOL64 = 203, ///< Sobol64 quasirandom generator
128
+ CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 = 204 ///< Scrambled Sobol64 quasirandom generator
129
+ };
130
+
131
+ /*
132
+ * CURAND generator types
133
+ */
134
+ /** \cond UNHIDE_TYPEDEFS */
135
+ typedef enum curandRngType curandRngType_t;
136
+ /** \endcond */
137
+
138
+ /**
139
+ * CURAND ordering of results in memory
140
+ */
141
+ enum curandOrdering {
142
+ CURAND_ORDERING_PSEUDO_BEST = 100, ///< Best ordering for pseudorandom results
143
+ CURAND_ORDERING_PSEUDO_DEFAULT = 101, ///< Specific default thread sequence for pseudorandom results, same as CURAND_ORDERING_PSEUDO_BEST
144
+ CURAND_ORDERING_PSEUDO_SEEDED = 102, ///< Specific seeding pattern for fast lower quality pseudorandom results
145
+ CURAND_ORDERING_PSEUDO_LEGACY = 103, ///< Specific legacy sequence for pseudorandom results, guaranteed to remain the same for all cuRAND releases
146
+ CURAND_ORDERING_PSEUDO_DYNAMIC = 104, ///< Specific ordering adjusted to the device it is being executed on, provides the best performance
147
+ CURAND_ORDERING_QUASI_DEFAULT = 201 ///< Specific n-dimensional ordering for quasirandom results
148
+ };
149
+
150
+ /*
151
+ * CURAND ordering of results in memory
152
+ */
153
+ /** \cond UNHIDE_TYPEDEFS */
154
+ typedef enum curandOrdering curandOrdering_t;
155
+ /** \endcond */
156
+
157
+ /**
158
+ * CURAND choice of direction vector set
159
+ */
160
+ enum curandDirectionVectorSet {
161
+ CURAND_DIRECTION_VECTORS_32_JOEKUO6 = 101, ///< Specific set of 32-bit direction vectors generated from polynomials recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions
162
+ CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 = 102, ///< Specific set of 32-bit direction vectors generated from polynomials recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions, and scrambled
163
+ CURAND_DIRECTION_VECTORS_64_JOEKUO6 = 103, ///< Specific set of 64-bit direction vectors generated from polynomials recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions
164
+ CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 = 104 ///< Specific set of 64-bit direction vectors generated from polynomials recommended by S. Joe and F. Y. Kuo, for up to 20,000 dimensions, and scrambled
165
+ };
166
+
167
+ /*
168
+ * CURAND choice of direction vector set
169
+ */
170
+ /** \cond UNHIDE_TYPEDEFS */
171
+ typedef enum curandDirectionVectorSet curandDirectionVectorSet_t;
172
+ /** \endcond */
173
+
174
+ /**
175
+ * CURAND array of 32-bit direction vectors
176
+ */
177
+ /** \cond UNHIDE_TYPEDEFS */
178
+ typedef unsigned int curandDirectionVectors32_t[32];
179
+ /** \endcond */
180
+
181
+ /**
182
+ * CURAND array of 64-bit direction vectors
183
+ */
184
+ /** \cond UNHIDE_TYPEDEFS */
185
+ typedef unsigned long long curandDirectionVectors64_t[64];
186
+ /** \endcond **/
187
+
188
+ /**
189
+ * CURAND generator (opaque)
190
+ */
191
+ struct curandGenerator_st;
192
+
193
+ /**
194
+ * CURAND generator
195
+ */
196
+ /** \cond UNHIDE_TYPEDEFS */
197
+ typedef struct curandGenerator_st *curandGenerator_t;
198
+ /** \endcond */
199
+
200
+ /**
201
+ * CURAND distribution
202
+ */
203
+ /** \cond UNHIDE_TYPEDEFS */
204
+ typedef double curandDistribution_st;
205
+ typedef curandDistribution_st *curandDistribution_t;
206
+ typedef struct curandDistributionShift_st *curandDistributionShift_t;
207
+ /** \endcond */
208
+ /**
209
+ * CURAND distribution M2
210
+ */
211
+ /** \cond UNHIDE_TYPEDEFS */
212
+ typedef struct curandDistributionM2Shift_st *curandDistributionM2Shift_t;
213
+ typedef struct curandHistogramM2_st *curandHistogramM2_t;
214
+ typedef unsigned int curandHistogramM2K_st;
215
+ typedef curandHistogramM2K_st *curandHistogramM2K_t;
216
+ typedef curandDistribution_st curandHistogramM2V_st;
217
+ typedef curandHistogramM2V_st *curandHistogramM2V_t;
218
+
219
+ typedef struct curandDiscreteDistribution_st *curandDiscreteDistribution_t;
220
+ /** \endcond */
221
+
222
+ /*
223
+ * CURAND METHOD
224
+ */
225
+ /** \cond UNHIDE_ENUMS */
226
+ enum curandMethod {
227
+ CURAND_CHOOSE_BEST = 0, // choose best depends on args
228
+ CURAND_ITR = 1,
229
+ CURAND_KNUTH = 2,
230
+ CURAND_HITR = 3,
231
+ CURAND_M1 = 4,
232
+ CURAND_M2 = 5,
233
+ CURAND_BINARY_SEARCH = 6,
234
+ CURAND_DISCRETE_GAUSS = 7,
235
+ CURAND_REJECTION = 8,
236
+ CURAND_DEVICE_API = 9,
237
+ CURAND_FAST_REJECTION = 10,
238
+ CURAND_3RD = 11,
239
+ CURAND_DEFINITION = 12,
240
+ CURAND_POISSON = 13
241
+ };
242
+
243
+ typedef enum curandMethod curandMethod_t;
244
+ /** \endcond */
245
+
246
+
247
+ #ifndef __CUDACC_RTC__
248
+
249
+ /**
250
+ * @}
251
+ */
252
+
253
+ /**
254
+ * \brief Create new random number generator.
255
+ *
256
+ * Creates a new random number generator of type \p rng_type
257
+ * and returns it in \p *generator.
258
+ *
259
+ * Legal values for \p rng_type are:
260
+ * - CURAND_RNG_PSEUDO_DEFAULT
261
+ * - CURAND_RNG_PSEUDO_XORWOW
262
+ * - CURAND_RNG_PSEUDO_MRG32K3A
263
+ * - CURAND_RNG_PSEUDO_MTGP32
264
+ * - CURAND_RNG_PSEUDO_MT19937
265
+ * - CURAND_RNG_PSEUDO_PHILOX4_32_10
266
+ * - CURAND_RNG_QUASI_DEFAULT
267
+ * - CURAND_RNG_QUASI_SOBOL32
268
+ * - CURAND_RNG_QUASI_SCRAMBLED_SOBOL32
269
+ * - CURAND_RNG_QUASI_SOBOL64
270
+ * - CURAND_RNG_QUASI_SCRAMBLED_SOBOL64
271
+ *
272
+ * When \p rng_type is CURAND_RNG_PSEUDO_DEFAULT, the type chosen
273
+ * is CURAND_RNG_PSEUDO_XORWOW. \n
274
+ * When \p rng_type is CURAND_RNG_QUASI_DEFAULT,
275
+ * the type chosen is CURAND_RNG_QUASI_SOBOL32.
276
+ *
277
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_XORWOW are:
278
+ * - \p seed = 0
279
+ * - \p offset = 0
280
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
281
+ *
282
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MRG32K3A are:
283
+ * - \p seed = 0
284
+ * - \p offset = 0
285
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
286
+ *
287
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MTGP32 are:
288
+ * - \p seed = 0
289
+ * - \p offset = 0
290
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
291
+ *
292
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MT19937 are:
293
+ * - \p seed = 0
294
+ * - \p offset = 0
295
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
296
+ *
297
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_PHILOX4_32_10 are:
298
+ * - \p seed = 0
299
+ * - \p offset = 0
300
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
301
+ *
302
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SOBOL32 are:
303
+ * - \p dimensions = 1
304
+ * - \p offset = 0
305
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
306
+ *
307
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SOBOL64 are:
308
+ * - \p dimensions = 1
309
+ * - \p offset = 0
310
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
311
+ *
312
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 are:
313
+ * - \p dimensions = 1
314
+ * - \p offset = 0
315
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
316
+ *
317
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 are:
318
+ * - \p dimensions = 1
319
+ * - \p offset = 0
320
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
321
+ *
322
+ * \param generator - Pointer to generator
323
+ * \param rng_type - Type of generator to create
324
+ *
325
+ * \return
326
+ * - CURAND_STATUS_ALLOCATION_FAILED, if memory could not be allocated \n
327
+ * - CURAND_STATUS_INITIALIZATION_FAILED if there was a problem setting up the GPU \n
328
+ * - CURAND_STATUS_VERSION_MISMATCH if the header file version does not match the
329
+ * dynamically linked library version \n
330
+ * - CURAND_STATUS_TYPE_ERROR if the value for \p rng_type is invalid \n
331
+ * - CURAND_STATUS_SUCCESS if generator was created successfully \n
332
+ *
333
+ */
334
+ curandStatus_t CURANDAPI
335
+ curandCreateGenerator(curandGenerator_t *generator, curandRngType_t rng_type);
336
+
337
+ /**
338
+ * \brief Create new host CPU random number generator.
339
+ *
340
+ * Creates a new host CPU random number generator of type \p rng_type
341
+ * and returns it in \p *generator.
342
+ *
343
+ * Legal values for \p rng_type are:
344
+ * - CURAND_RNG_PSEUDO_DEFAULT
345
+ * - CURAND_RNG_PSEUDO_XORWOW
346
+ * - CURAND_RNG_PSEUDO_MRG32K3A
347
+ * - CURAND_RNG_PSEUDO_MTGP32
348
+ * - CURAND_RNG_PSEUDO_MT19937
349
+ * - CURAND_RNG_PSEUDO_PHILOX4_32_10
350
+ * - CURAND_RNG_QUASI_DEFAULT
351
+ * - CURAND_RNG_QUASI_SOBOL32
352
+ *
353
+ * When \p rng_type is CURAND_RNG_PSEUDO_DEFAULT, the type chosen
354
+ * is CURAND_RNG_PSEUDO_XORWOW. \n
355
+ * When \p rng_type is CURAND_RNG_QUASI_DEFAULT,
356
+ * the type chosen is CURAND_RNG_QUASI_SOBOL32.
357
+ *
358
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_XORWOW are:
359
+ * - \p seed = 0
360
+ * - \p offset = 0
361
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
362
+ *
363
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MRG32K3A are:
364
+ * - \p seed = 0
365
+ * - \p offset = 0
366
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
367
+ *
368
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MTGP32 are:
369
+ * - \p seed = 0
370
+ * - \p offset = 0
371
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
372
+ *
373
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_MT19937 are:
374
+ * - \p seed = 0
375
+ * - \p offset = 0
376
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
377
+ *
378
+ * The default values for \p rng_type = CURAND_RNG_PSEUDO_PHILOX4_32_10 are:
379
+ * - \p seed = 0
380
+ * - \p offset = 0
381
+ * - \p ordering = CURAND_ORDERING_PSEUDO_DEFAULT
382
+ *
383
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SOBOL32 are:
384
+ * - \p dimensions = 1
385
+ * - \p offset = 0
386
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
387
+ *
388
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SOBOL64 are:
389
+ * - \p dimensions = 1
390
+ * - \p offset = 0
391
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
392
+ *
393
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL32 are:
394
+ * - \p dimensions = 1
395
+ * - \p offset = 0
396
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
397
+ *
398
+ * The default values for \p rng_type = CURAND_RNG_QUASI_SCRAMBLED_SOBOL64 are:
399
+ * - \p dimensions = 1
400
+ * - \p offset = 0
401
+ * - \p ordering = CURAND_ORDERING_QUASI_DEFAULT
402
+ *
403
+ * \param generator - Pointer to generator
404
+ * \param rng_type - Type of generator to create
405
+ *
406
+ * \return
407
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
408
+ * - CURAND_STATUS_INITIALIZATION_FAILED if there was a problem setting up the GPU \n
409
+ * - CURAND_STATUS_VERSION_MISMATCH if the header file version does not match the
410
+ * dynamically linked library version \n
411
+ * - CURAND_STATUS_TYPE_ERROR if the value for \p rng_type is invalid \n
412
+ * - CURAND_STATUS_SUCCESS if generator was created successfully \n
413
+ */
414
+ curandStatus_t CURANDAPI
415
+ curandCreateGeneratorHost(curandGenerator_t *generator, curandRngType_t rng_type);
416
+
417
+ /**
418
+ * \brief Destroy an existing generator.
419
+ *
420
+ * Destroy an existing generator and free all memory associated with its state.
421
+ *
422
+ * \param generator - Generator to destroy
423
+ *
424
+ * \return
425
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
426
+ * - CURAND_STATUS_SUCCESS if generator was destroyed successfully \n
427
+ */
428
+ curandStatus_t CURANDAPI
429
+ curandDestroyGenerator(curandGenerator_t generator);
430
+
431
+ /**
432
+ * \brief Return the version number of the library.
433
+ *
434
+ * Return in \p *version the version number of the dynamically linked CURAND
435
+ * library. The format is the same as CUDART_VERSION from the CUDA Runtime.
436
+ * The only supported configuration is CURAND version equal to CUDA Runtime
437
+ * version.
438
+ *
439
+ * \param version - CURAND library version
440
+ *
441
+ * \return
442
+ * - CURAND_STATUS_SUCCESS if the version number was successfully returned \n
443
+ */
444
+ curandStatus_t CURANDAPI
445
+ curandGetVersion(int *version);
446
+
447
+ /**
448
+ * \brief Return the value of the curand property.
449
+ *
450
+ * Return in \p *value the number for the property described by \p type of the
451
+ * dynamically linked CURAND library.
452
+ *
453
+ * \param type - CUDA library property
454
+ * \param value - integer value for the requested property
455
+ *
456
+ * \return
457
+ * - CURAND_STATUS_SUCCESS if the property value was successfully returned \n
458
+ * - CURAND_STATUS_OUT_OF_RANGE if the property type is not recognized \n
459
+ */
460
+ curandStatus_t CURANDAPI
461
+ curandGetProperty(libraryPropertyType type, int *value);
462
+
463
+
464
+ /**
465
+ * \brief Set the current stream for CURAND kernel launches.
466
+ *
467
+ * Set the current stream for CURAND kernel launches. All library functions
468
+ * will use this stream until set again.
469
+ *
470
+ * \param generator - Generator to modify
471
+ * \param stream - Stream to use or ::NULL for null stream
472
+ *
473
+ * \return
474
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
475
+ * - CURAND_STATUS_SUCCESS if stream was set successfully \n
476
+ */
477
+ curandStatus_t CURANDAPI
478
+ curandSetStream(curandGenerator_t generator, cudaStream_t stream);
479
+
480
+ /**
481
+ * \brief Set the seed value of the pseudo-random number generator.
482
+ *
483
+ * Set the seed value of the pseudorandom number generator.
484
+ * All values of seed are valid. Different seeds will produce different sequences.
485
+ * Different seeds will often not be statistically correlated with each other,
486
+ * but some pairs of seed values may generate sequences which are statistically correlated.
487
+ *
488
+ * \param generator - Generator to modify
489
+ * \param seed - Seed value
490
+ *
491
+ * \return
492
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
493
+ * - CURAND_STATUS_TYPE_ERROR if the generator is not a pseudorandom number generator \n
494
+ * - CURAND_STATUS_SUCCESS if generator seed was set successfully \n
495
+ */
496
+ curandStatus_t CURANDAPI
497
+ curandSetPseudoRandomGeneratorSeed(curandGenerator_t generator, unsigned long long seed);
498
+
499
+ /**
500
+ * \brief Set the absolute offset of the pseudo or quasirandom number generator.
501
+ *
502
+ * Set the absolute offset of the pseudo or quasirandom number generator.
503
+ *
504
+ * All values of offset are valid. The offset position is absolute, not
505
+ * relative to the current position in the sequence.
506
+ *
507
+ * \param generator - Generator to modify
508
+ * \param offset - Absolute offset position
509
+ *
510
+ * \return
511
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
512
+ * - CURAND_STATUS_SUCCESS if generator offset was set successfully \n
513
+ */
514
+ curandStatus_t CURANDAPI
515
+ curandSetGeneratorOffset(curandGenerator_t generator, unsigned long long offset);
516
+
517
+ /**
518
+ * \brief Set the ordering of results of the pseudo or quasirandom number generator.
519
+ *
520
+ * Set the ordering of results of the pseudo or quasirandom number generator.
521
+ *
522
+ * Legal values of \p order for pseudorandom generators are:
523
+ * - CURAND_ORDERING_PSEUDO_DEFAULT
524
+ * - CURAND_ORDERING_PSEUDO_BEST
525
+ * - CURAND_ORDERING_PSEUDO_SEEDED
526
+ * - CURAND_ORDERING_PSEUDO_LEGACY
527
+ *
528
+ * Legal values of \p order for quasirandom generators are:
529
+ * - CURAND_ORDERING_QUASI_DEFAULT
530
+ *
531
+ * \param generator - Generator to modify
532
+ * \param order - Ordering of results
533
+ *
534
+ * \return
535
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
536
+ * - CURAND_STATUS_OUT_OF_RANGE if the ordering is not valid \n
537
+ * - CURAND_STATUS_SUCCESS if generator ordering was set successfully \n
538
+ */
539
+ curandStatus_t CURANDAPI
540
+ curandSetGeneratorOrdering(curandGenerator_t generator, curandOrdering_t order);
541
+
542
+ /**
543
+ * \brief Set the number of dimensions.
544
+ *
545
+ * Set the number of dimensions to be generated by the quasirandom number
546
+ * generator.
547
+ *
548
+ * Legal values for \p num_dimensions are 1 to 20000.
549
+ *
550
+ * \param generator - Generator to modify
551
+ * \param num_dimensions - Number of dimensions
552
+ *
553
+ * \return
554
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
555
+ * - CURAND_STATUS_OUT_OF_RANGE if num_dimensions is not valid \n
556
+ * - CURAND_STATUS_TYPE_ERROR if the generator is not a quasirandom number generator \n
557
+ * - CURAND_STATUS_SUCCESS if generator ordering was set successfully \n
558
+ */
559
+ curandStatus_t CURANDAPI
560
+ curandSetQuasiRandomGeneratorDimensions(curandGenerator_t generator, unsigned int num_dimensions);
561
+
562
+ /**
563
+ * \brief Generate 32-bit pseudo or quasirandom numbers.
564
+ *
565
+ * Use \p generator to generate \p num 32-bit results into the device memory at
566
+ * \p outputPtr. The device memory must have been previously allocated and be
567
+ * large enough to hold all the results. Launches are done with the stream
568
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
569
+ *
570
+ * Results are 32-bit values with every bit random.
571
+ *
572
+ * \param generator - Generator to use
573
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
574
+ * Pointer to host memory to store CPU-generated results
575
+ * \param num - Number of random 32-bit values to generate
576
+ *
577
+ * \return
578
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
579
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
580
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
581
+ * a previous kernel launch \n
582
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
583
+ * not a multiple of the quasirandom dimension \n
584
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
585
+ * - CURAND_STATUS_TYPE_ERROR if the generator is a 64 bit quasirandom generator.
586
+ * (use ::curandGenerateLongLong() with 64 bit quasirandom generators)
587
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
588
+ */
589
+ curandStatus_t CURANDAPI
590
+ curandGenerate(curandGenerator_t generator, unsigned int *outputPtr, size_t num);
591
+
592
+ /**
593
+ * \brief Generate 64-bit quasirandom numbers.
594
+ *
595
+ * Use \p generator to generate \p num 64-bit results into the device memory at
596
+ * \p outputPtr. The device memory must have been previously allocated and be
597
+ * large enough to hold all the results. Launches are done with the stream
598
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
599
+ *
600
+ * Results are 64-bit values with every bit random.
601
+ *
602
+ * \param generator - Generator to use
603
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
604
+ * Pointer to host memory to store CPU-generated results
605
+ * \param num - Number of random 64-bit values to generate
606
+ *
607
+ * \return
608
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
609
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
610
+ * a previous kernel launch \n
611
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
612
+ * not a multiple of the quasirandom dimension \n
613
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
614
+ * - CURAND_STATUS_TYPE_ERROR if the generator is not a 64 bit quasirandom generator\n
615
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
616
+ */
617
+ curandStatus_t CURANDAPI
618
+ curandGenerateLongLong(curandGenerator_t generator, unsigned long long *outputPtr, size_t num);
619
+
620
+ /**
621
+ * \brief Generate uniformly distributed floats.
622
+ *
623
+ * Use \p generator to generate \p num float results into the device memory at
624
+ * \p outputPtr. The device memory must have been previously allocated and be
625
+ * large enough to hold all the results. Launches are done with the stream
626
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
627
+ *
628
+ * Results are 32-bit floating point values between \p 0.0f and \p 1.0f,
629
+ * excluding \p 0.0f and including \p 1.0f.
630
+ *
631
+ * \param generator - Generator to use
632
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
633
+ * Pointer to host memory to store CPU-generated results
634
+ * \param num - Number of floats to generate
635
+ *
636
+ * \return
637
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
638
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
639
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
640
+ * a previous kernel launch \n
641
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
642
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
643
+ * not a multiple of the quasirandom dimension \n
644
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
645
+ */
646
+ curandStatus_t CURANDAPI
647
+ curandGenerateUniform(curandGenerator_t generator, float *outputPtr, size_t num);
648
+
649
+ /**
650
+ * \brief Generate uniformly distributed doubles.
651
+ *
652
+ * Use \p generator to generate \p num double results into the device memory at
653
+ * \p outputPtr. The device memory must have been previously allocated and be
654
+ * large enough to hold all the results. Launches are done with the stream
655
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
656
+ *
657
+ * Results are 64-bit double precision floating point values between
658
+ * \p 0.0 and \p 1.0, excluding \p 0.0 and including \p 1.0.
659
+ *
660
+ * \param generator - Generator to use
661
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
662
+ * Pointer to host memory to store CPU-generated results
663
+ * \param num - Number of doubles to generate
664
+ *
665
+ * \return
666
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
667
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
668
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
669
+ * a previous kernel launch \n
670
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
671
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
672
+ * not a multiple of the quasirandom dimension \n
673
+ * - CURAND_STATUS_DOUBLE_PRECISION_REQUIRED if the GPU does not support double precision \n
674
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
675
+ */
676
+ curandStatus_t CURANDAPI
677
+ curandGenerateUniformDouble(curandGenerator_t generator, double *outputPtr, size_t num);
678
+
679
+ /**
680
+ * \brief Generate normally distributed floats.
681
+ *
682
+ * Use \p generator to generate \p n float results into the device memory at
683
+ * \p outputPtr. The device memory must have been previously allocated and be
684
+ * large enough to hold all the results. Launches are done with the stream
685
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
686
+ *
687
+ * Results are 32-bit floating point values with mean \p mean and standard
688
+ * deviation \p stddev.
689
+ *
690
+ * Normally distributed results are generated from pseudorandom generators
691
+ * with a Box-Muller transform, and so require \p n to be even.
692
+ * Quasirandom generators use an inverse cumulative distribution
693
+ * function to preserve dimensionality.
694
+ *
695
+ * There may be slight numerical differences between results generated
696
+ * on the GPU with generators created with ::curandCreateGenerator()
697
+ * and results calculated on the CPU with generators created with
698
+ * ::curandCreateGeneratorHost(). These differences arise because of
699
+ * differences in results for transcendental functions. In addition,
700
+ * future versions of CURAND may use newer versions of the CUDA math
701
+ * library, so different versions of CURAND may give slightly different
702
+ * numerical values.
703
+ *
704
+ * \param generator - Generator to use
705
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
706
+ * Pointer to host memory to store CPU-generated results
707
+ * \param n - Number of floats to generate
708
+ * \param mean - Mean of normal distribution
709
+ * \param stddev - Standard deviation of normal distribution
710
+ *
711
+ * \return
712
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
713
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
714
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
715
+ * a previous kernel launch \n
716
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
717
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
718
+ * not a multiple of the quasirandom dimension, or is not a multiple
719
+ * of two for pseudorandom generators \n
720
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
721
+ */
722
+ curandStatus_t CURANDAPI
723
+ curandGenerateNormal(curandGenerator_t generator, float *outputPtr,
724
+ size_t n, float mean, float stddev);
725
+
726
+ /**
727
+ * \brief Generate normally distributed doubles.
728
+ *
729
+ * Use \p generator to generate \p n double results into the device memory at
730
+ * \p outputPtr. The device memory must have been previously allocated and be
731
+ * large enough to hold all the results. Launches are done with the stream
732
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
733
+ *
734
+ * Results are 64-bit floating point values with mean \p mean and standard
735
+ * deviation \p stddev.
736
+ *
737
+ * Normally distributed results are generated from pseudorandom generators
738
+ * with a Box-Muller transform, and so require \p n to be even.
739
+ * Quasirandom generators use an inverse cumulative distribution
740
+ * function to preserve dimensionality.
741
+ *
742
+ * There may be slight numerical differences between results generated
743
+ * on the GPU with generators created with ::curandCreateGenerator()
744
+ * and results calculated on the CPU with generators created with
745
+ * ::curandCreateGeneratorHost(). These differences arise because of
746
+ * differences in results for transcendental functions. In addition,
747
+ * future versions of CURAND may use newer versions of the CUDA math
748
+ * library, so different versions of CURAND may give slightly different
749
+ * numerical values.
750
+ *
751
+ * \param generator - Generator to use
752
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
753
+ * Pointer to host memory to store CPU-generated results
754
+ * \param n - Number of doubles to generate
755
+ * \param mean - Mean of normal distribution
756
+ * \param stddev - Standard deviation of normal distribution
757
+ *
758
+ * \return
759
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
760
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
761
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
762
+ * a previous kernel launch \n
763
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
764
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
765
+ * not a multiple of the quasirandom dimension, or is not a multiple
766
+ * of two for pseudorandom generators \n
767
+ * - CURAND_STATUS_DOUBLE_PRECISION_REQUIRED if the GPU does not support double precision \n
768
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
769
+ */
770
+ curandStatus_t CURANDAPI
771
+ curandGenerateNormalDouble(curandGenerator_t generator, double *outputPtr,
772
+ size_t n, double mean, double stddev);
773
+
774
+ /**
775
+ * \brief Generate log-normally distributed floats.
776
+ *
777
+ * Use \p generator to generate \p n float results into the device memory at
778
+ * \p outputPtr. The device memory must have been previously allocated and be
779
+ * large enough to hold all the results. Launches are done with the stream
780
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
781
+ *
782
+ * Results are 32-bit floating point values with log-normal distribution based on
783
+ * an associated normal distribution with mean \p mean and standard deviation \p stddev.
784
+ *
785
+ * Normally distributed results are generated from pseudorandom generators
786
+ * with a Box-Muller transform, and so require \p n to be even.
787
+ * Quasirandom generators use an inverse cumulative distribution
788
+ * function to preserve dimensionality.
789
+ * The normally distributed results are transformed into log-normal distribution.
790
+ *
791
+ * There may be slight numerical differences between results generated
792
+ * on the GPU with generators created with ::curandCreateGenerator()
793
+ * and results calculated on the CPU with generators created with
794
+ * ::curandCreateGeneratorHost(). These differences arise because of
795
+ * differences in results for transcendental functions. In addition,
796
+ * future versions of CURAND may use newer versions of the CUDA math
797
+ * library, so different versions of CURAND may give slightly different
798
+ * numerical values.
799
+ *
800
+ * \param generator - Generator to use
801
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
802
+ * Pointer to host memory to store CPU-generated results
803
+ * \param n - Number of floats to generate
804
+ * \param mean - Mean of associated normal distribution
805
+ * \param stddev - Standard deviation of associated normal distribution
806
+ *
807
+ * \return
808
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
809
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
810
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
811
+ * a previous kernel launch \n
812
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
813
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
814
+ * not a multiple of the quasirandom dimension, or is not a multiple
815
+ * of two for pseudorandom generators \n
816
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
817
+ */
818
+ curandStatus_t CURANDAPI
819
+ curandGenerateLogNormal(curandGenerator_t generator, float *outputPtr,
820
+ size_t n, float mean, float stddev);
821
+
822
+ /**
823
+ * \brief Generate log-normally distributed doubles.
824
+ *
825
+ * Use \p generator to generate \p n double results into the device memory at
826
+ * \p outputPtr. The device memory must have been previously allocated and be
827
+ * large enough to hold all the results. Launches are done with the stream
828
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
829
+ *
830
+ * Results are 64-bit floating point values with log-normal distribution based on
831
+ * an associated normal distribution with mean \p mean and standard deviation \p stddev.
832
+ *
833
+ * Normally distributed results are generated from pseudorandom generators
834
+ * with a Box-Muller transform, and so require \p n to be even.
835
+ * Quasirandom generators use an inverse cumulative distribution
836
+ * function to preserve dimensionality.
837
+ * The normally distributed results are transformed into log-normal distribution.
838
+ *
839
+ * There may be slight numerical differences between results generated
840
+ * on the GPU with generators created with ::curandCreateGenerator()
841
+ * and results calculated on the CPU with generators created with
842
+ * ::curandCreateGeneratorHost(). These differences arise because of
843
+ * differences in results for transcendental functions. In addition,
844
+ * future versions of CURAND may use newer versions of the CUDA math
845
+ * library, so different versions of CURAND may give slightly different
846
+ * numerical values.
847
+ *
848
+ * \param generator - Generator to use
849
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
850
+ * Pointer to host memory to store CPU-generated results
851
+ * \param n - Number of doubles to generate
852
+ * \param mean - Mean of normal distribution
853
+ * \param stddev - Standard deviation of normal distribution
854
+ *
855
+ * \return
856
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
857
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
858
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
859
+ * a previous kernel launch \n
860
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
861
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
862
+ * not a multiple of the quasirandom dimension, or is not a multiple
863
+ * of two for pseudorandom generators \n
864
+ * - CURAND_STATUS_DOUBLE_PRECISION_REQUIRED if the GPU does not support double precision \n
865
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
866
+ */
867
+ curandStatus_t CURANDAPI
868
+ curandGenerateLogNormalDouble(curandGenerator_t generator, double *outputPtr,
869
+ size_t n, double mean, double stddev);
870
+
871
+ /**
872
+ * \brief Construct the histogram array for a Poisson distribution.
873
+ *
874
+ * Construct the histogram array for the Poisson distribution with lambda \p lambda.
875
+ * For lambda greater than 2000, an approximation with a normal distribution is used.
876
+ *
877
+ * \param lambda - lambda for the Poisson distribution
878
+ *
879
+ *
880
+ * \param discrete_distribution - pointer to the histogram in device memory
881
+ *
882
+ * \return
883
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
884
+ * - CURAND_STATUS_DOUBLE_PRECISION_REQUIRED if the GPU does not support double precision \n
885
+ * - CURAND_STATUS_INITIALIZATION_FAILED if there was a problem setting up the GPU \n
886
+ * - CURAND_STATUS_NOT_INITIALIZED if the distribution pointer was null \n
887
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
888
+ * a previous kernel launch \n
889
+ * - CURAND_STATUS_OUT_OF_RANGE if lambda is non-positive or greater than 400,000 \n
890
+ * - CURAND_STATUS_SUCCESS if the histogram was generated successfully \n
891
+ */
892
+
893
+ curandStatus_t CURANDAPI
894
+ curandCreatePoissonDistribution(double lambda, curandDiscreteDistribution_t *discrete_distribution);
895
+
896
+
897
+
898
+ /**
899
+ * \brief Destroy the histogram array for a discrete distribution (e.g. Poisson).
900
+ *
901
+ * Destroy the histogram array for a discrete distribution created by curandCreatePoissonDistribution.
902
+ *
903
+ * \param discrete_distribution - pointer to device memory where the histogram is stored
904
+ *
905
+ * \return
906
+ * - CURAND_STATUS_NOT_INITIALIZED if the histogram was never created \n
907
+ * - CURAND_STATUS_SUCCESS if the histogram was destroyed successfully \n
908
+ */
909
+ curandStatus_t CURANDAPI
910
+ curandDestroyDistribution(curandDiscreteDistribution_t discrete_distribution);
911
+
912
+
913
+ /**
914
+ * \brief Generate Poisson-distributed unsigned ints.
915
+ *
916
+ * Use \p generator to generate \p n unsigned int results into device memory at
917
+ * \p outputPtr. The device memory must have been previously allocated and must be
918
+ * large enough to hold all the results. Launches are done with the stream
919
+ * set using ::curandSetStream(), or the null stream if no stream has been set.
920
+ *
921
+ * Results are 32-bit unsigned int point values with Poisson distribution, with lambda \p lambda.
922
+ *
923
+ * \param generator - Generator to use
924
+ * \param outputPtr - Pointer to device memory to store CUDA-generated results, or
925
+ * Pointer to host memory to store CPU-generated results
926
+ * \param n - Number of unsigned ints to generate
927
+ * \param lambda - lambda for the Poisson distribution
928
+ *
929
+ * \return
930
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
931
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
932
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
933
+ * a previous kernel launch \n
934
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
935
+ * - CURAND_STATUS_LENGTH_NOT_MULTIPLE if the number of output samples is
936
+ * not a multiple of the quasirandom dimension\n
937
+ * - CURAND_STATUS_DOUBLE_PRECISION_REQUIRED if the GPU or sm does not support double precision \n
938
+ * - CURAND_STATUS_OUT_OF_RANGE if lambda is non-positive or greater than 400,000 \n
939
+ * - CURAND_STATUS_SUCCESS if the results were generated successfully \n
940
+ */
941
+
942
+ curandStatus_t CURANDAPI
943
+ curandGeneratePoisson(curandGenerator_t generator, unsigned int *outputPtr,
944
+ size_t n, double lambda);
945
+ // just for internal usage
946
+ curandStatus_t CURANDAPI
947
+ curandGeneratePoissonMethod(curandGenerator_t generator, unsigned int *outputPtr,
948
+ size_t n, double lambda, curandMethod_t method);
949
+
950
+
951
+ curandStatus_t CURANDAPI
952
+ curandGenerateBinomial(curandGenerator_t generator, unsigned int *outputPtr,
953
+ size_t num, unsigned int n, double p);
954
+ // just for internal usage
955
+ curandStatus_t CURANDAPI
956
+ curandGenerateBinomialMethod(curandGenerator_t generator,
957
+ unsigned int *outputPtr,
958
+ size_t num, unsigned int n, double p,
959
+ curandMethod_t method);
960
+
961
+
962
+ /**
963
+ * \brief Setup starting states.
964
+ *
965
+ * Generate the starting state of the generator. This function is
966
+ * automatically called by generation functions such as
967
+ * ::curandGenerate() and ::curandGenerateUniform().
968
+ * It can be called manually for performance testing reasons to separate
969
+ * timings for starting state generation and random number generation.
970
+ *
971
+ * \param generator - Generator to update
972
+ *
973
+ * \return
974
+ * - CURAND_STATUS_ALLOCATION_FAILED if memory could not be allocated \n
975
+ * - CURAND_STATUS_NOT_INITIALIZED if the generator was never created \n
976
+ * - CURAND_STATUS_PREEXISTING_FAILURE if there was an existing error from
977
+ * a previous kernel launch \n
978
+ * - CURAND_STATUS_LAUNCH_FAILURE if the kernel launch failed for any reason \n
979
+ * - CURAND_STATUS_SUCCESS if the seeds were generated successfully \n
980
+ */
981
+ curandStatus_t CURANDAPI
982
+ curandGenerateSeeds(curandGenerator_t generator);
983
+
984
+ /**
985
+ * \brief Get direction vectors for 32-bit quasirandom number generation.
986
+ *
987
+ * Get a pointer to an array of direction vectors that can be used
988
+ * for quasirandom number generation. The resulting pointer will
989
+ * reference an array of direction vectors in host memory.
990
+ *
991
+ * The array contains vectors for many dimensions. Each dimension
992
+ * has 32 vectors. Each individual vector is an unsigned int.
993
+ *
994
+ * Legal values for \p set are:
995
+ * - CURAND_DIRECTION_VECTORS_32_JOEKUO6 (20,000 dimensions)
996
+ * - CURAND_SCRAMBLED_DIRECTION_VECTORS_32_JOEKUO6 (20,000 dimensions)
997
+ *
998
+ * \param vectors - Address of pointer in which to return direction vectors
999
+ * \param set - Which set of direction vectors to use
1000
+ *
1001
+ * \return
1002
+ * - CURAND_STATUS_OUT_OF_RANGE if the choice of set is invalid \n
1003
+ * - CURAND_STATUS_SUCCESS if the pointer was set successfully \n
1004
+ */
1005
+ curandStatus_t CURANDAPI
1006
+ curandGetDirectionVectors32(curandDirectionVectors32_t *vectors[], curandDirectionVectorSet_t set);
1007
+
1008
+ /**
1009
+ * \brief Get scramble constants for 32-bit scrambled Sobol' .
1010
+ *
1011
+ * Get a pointer to an array of scramble constants that can be used
1012
+ * for quasirandom number generation. The resulting pointer will
1013
+ * reference an array of unsinged ints in host memory.
1014
+ *
1015
+ * The array contains constants for many dimensions. Each dimension
1016
+ * has a single unsigned int constant.
1017
+ *
1018
+ * \param constants - Address of pointer in which to return scramble constants
1019
+ *
1020
+ * \return
1021
+ * - CURAND_STATUS_SUCCESS if the pointer was set successfully \n
1022
+ */
1023
+ curandStatus_t CURANDAPI
1024
+ curandGetScrambleConstants32(unsigned int * * constants);
1025
+
1026
+ /**
1027
+ * \brief Get direction vectors for 64-bit quasirandom number generation.
1028
+ *
1029
+ * Get a pointer to an array of direction vectors that can be used
1030
+ * for quasirandom number generation. The resulting pointer will
1031
+ * reference an array of direction vectors in host memory.
1032
+ *
1033
+ * The array contains vectors for many dimensions. Each dimension
1034
+ * has 64 vectors. Each individual vector is an unsigned long long.
1035
+ *
1036
+ * Legal values for \p set are:
1037
+ * - CURAND_DIRECTION_VECTORS_64_JOEKUO6 (20,000 dimensions)
1038
+ * - CURAND_SCRAMBLED_DIRECTION_VECTORS_64_JOEKUO6 (20,000 dimensions)
1039
+ *
1040
+ * \param vectors - Address of pointer in which to return direction vectors
1041
+ * \param set - Which set of direction vectors to use
1042
+ *
1043
+ * \return
1044
+ * - CURAND_STATUS_OUT_OF_RANGE if the choice of set is invalid \n
1045
+ * - CURAND_STATUS_SUCCESS if the pointer was set successfully \n
1046
+ */
1047
+ curandStatus_t CURANDAPI
1048
+ curandGetDirectionVectors64(curandDirectionVectors64_t *vectors[], curandDirectionVectorSet_t set);
1049
+
1050
+ /**
1051
+ * \brief Get scramble constants for 64-bit scrambled Sobol' .
1052
+ *
1053
+ * Get a pointer to an array of scramble constants that can be used
1054
+ * for quasirandom number generation. The resulting pointer will
1055
+ * reference an array of unsinged long longs in host memory.
1056
+ *
1057
+ * The array contains constants for many dimensions. Each dimension
1058
+ * has a single unsigned long long constant.
1059
+ *
1060
+ * \param constants - Address of pointer in which to return scramble constants
1061
+ *
1062
+ * \return
1063
+ * - CURAND_STATUS_SUCCESS if the pointer was set successfully \n
1064
+ */
1065
+ curandStatus_t CURANDAPI
1066
+ curandGetScrambleConstants64(unsigned long long * * constants);
1067
+
1068
+ /** @} */
1069
+
1070
+ #endif // __CUDACC_RTC__
1071
+
1072
+ #if defined(__cplusplus)
1073
+ }
1074
+ #endif /* __cplusplus */
1075
+
1076
+
1077
+ #endif /* !defined(CURAND_H_) */
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete.h ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #if !defined(CURANDDISCRETE_H_)
50
+ #define CURANDDISCRETE_H_
51
+
52
+ struct curandDistributionShift_st {
53
+ curandDistribution_t probability;
54
+ curandDistribution_t host_probability;
55
+ unsigned int shift;
56
+ unsigned int length;
57
+ unsigned int host_gen;
58
+ };
59
+
60
+ struct curandHistogramM2_st {
61
+ curandHistogramM2V_t V;
62
+ curandHistogramM2V_t host_V;
63
+ curandHistogramM2K_t K;
64
+ curandHistogramM2K_t host_K;
65
+ unsigned int host_gen;
66
+ };
67
+
68
+
69
+ struct curandDistributionM2Shift_st {
70
+ curandHistogramM2_t histogram;
71
+ curandHistogramM2_t host_histogram;
72
+ unsigned int shift;
73
+ unsigned int length;
74
+ unsigned int host_gen;
75
+ };
76
+
77
+ struct curandDiscreteDistribution_st {
78
+ curandDiscreteDistribution_t self_host_ptr;
79
+ curandDistributionM2Shift_t M2;
80
+ curandDistributionM2Shift_t host_M2;
81
+ double stddev;
82
+ double mean;
83
+ curandMethod_t method;
84
+ unsigned int host_gen;
85
+ };
86
+
87
+ #endif // !defined(CURANDDISCRETE_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_discrete2.h ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_DISCRETE_H_)
52
+ #define CURAND_DISCRETE_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+
68
+
69
+ template <typename T>
70
+ QUALIFIERS unsigned int _curand_discrete(T x, curandDiscreteDistribution_t discrete_distribution){
71
+ if (discrete_distribution->method == CURAND_M2){
72
+ return _curand_M2_double(x, discrete_distribution->M2);
73
+ }
74
+ return (unsigned int)((discrete_distribution->stddev * _curand_normal_icdf_double(x)) + discrete_distribution->mean + 0.5);
75
+ }
76
+
77
+
78
+ template <typename STATE>
79
+ QUALIFIERS unsigned int curand__discrete(STATE state, curandDiscreteDistribution_t discrete_distribution){
80
+ if (discrete_distribution->method == CURAND_M2){
81
+ return curand_M2_double(state, discrete_distribution->M2);
82
+ }
83
+ return (unsigned int)((discrete_distribution->stddev * curand_normal_double(state)) + discrete_distribution->mean + 0.5); //Round to nearest
84
+ }
85
+
86
+ template <typename STATE>
87
+ QUALIFIERS uint4 curand__discrete4(STATE state, curandDiscreteDistribution_t discrete_distribution){
88
+ if (discrete_distribution->method == CURAND_M2){
89
+ return curand_M2_double4(state, discrete_distribution->M2);
90
+ }
91
+ double4 _res;
92
+ uint4 result;
93
+ _res = curand_normal4_double(state);
94
+ result.x = (unsigned int)((discrete_distribution->stddev * _res.x) + discrete_distribution->mean + 0.5); //Round to nearest
95
+ result.y = (unsigned int)((discrete_distribution->stddev * _res.y) + discrete_distribution->mean + 0.5); //Round to nearest
96
+ result.z = (unsigned int)((discrete_distribution->stddev * _res.z) + discrete_distribution->mean + 0.5); //Round to nearest
97
+ result.w = (unsigned int)((discrete_distribution->stddev * _res.w) + discrete_distribution->mean + 0.5); //Round to nearest
98
+ return result;
99
+ }
100
+
101
+ /*
102
+ * \brief Return a discrete distributed unsigned int from a XORWOW generator.
103
+ *
104
+ * Return a single discrete distributed unsigned int derived from a
105
+ * distribution defined by \p discrete_distribution from the XORWOW generator in \p state,
106
+ * increment position of generator by one.
107
+ *
108
+ * \param state - Pointer to state to update
109
+ * \param discrete_distribution - ancillary structure for discrete distribution
110
+ *
111
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
112
+ */
113
+ QUALIFIERS unsigned int curand_discrete(curandStateXORWOW_t *state, curandDiscreteDistribution_t discrete_distribution)
114
+ {
115
+ return curand__discrete(state, discrete_distribution);
116
+ }
117
+
118
+ /*
119
+ * \brief Return a discrete distributed unsigned int from a Philox4_32_10 generator.
120
+ *
121
+ * Return a single discrete distributed unsigned int derived from a
122
+ * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
123
+ * increment position of generator by one.
124
+ *
125
+ * \param state - Pointer to state to update
126
+ * \param discrete_distribution - ancillary structure for discrete distribution
127
+ *
128
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
129
+ */
130
+ QUALIFIERS unsigned int curand_discrete(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
131
+ {
132
+ return curand__discrete(state, discrete_distribution);
133
+ }
134
+
135
+ /*
136
+ * \brief Return four discrete distributed unsigned ints from a Philox4_32_10 generator.
137
+ *
138
+ * Return four single discrete distributed unsigned ints derived from a
139
+ * distribution defined by \p discrete_distribution from the Philox4_32_10 generator in \p state,
140
+ * increment position of generator by one.
141
+ *
142
+ * \param state - Pointer to state to update
143
+ * \param discrete_distribution - ancillary structure for discrete distribution
144
+ *
145
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
146
+ */
147
+ QUALIFIERS uint4 curand_discrete4(curandStatePhilox4_32_10_t *state, curandDiscreteDistribution_t discrete_distribution)
148
+ {
149
+ return curand__discrete4(state, discrete_distribution);
150
+ }
151
+ /*
152
+ * \brief Return a discrete distributed unsigned int from a MRG32k3a generator.
153
+ *
154
+ * Re turn a single discrete distributed unsigned int derived from a
155
+ * distribution defined by \p discrete_distribution from the MRG32k3a generator in \p state,
156
+ * increment position of generator by one.
157
+ *
158
+ * \param state - Pointer to state to update
159
+ * \param discrete_distribution - ancillary structure for discrete distribution
160
+ *
161
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
162
+ */
163
+ QUALIFIERS unsigned int curand_discrete(curandStateMRG32k3a_t *state, curandDiscreteDistribution_t discrete_distribution)
164
+ {
165
+ return curand__discrete(state, discrete_distribution);
166
+ }
167
+
168
+ /*
169
+ * \brief Return a discrete distributed unsigned int from a MTGP32 generator.
170
+ *
171
+ * Return a single discrete distributed unsigned int derived from a
172
+ * distribution defined by \p discrete_distribution from the MTGP32 generator in \p state,
173
+ * increment position of generator by one.
174
+ *
175
+ * \param state - Pointer to state to update
176
+ * \param discrete_distribution - ancillary structure for discrete distribution
177
+ *
178
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
179
+ */
180
+ QUALIFIERS unsigned int curand_discrete(curandStateMtgp32_t *state, curandDiscreteDistribution_t discrete_distribution)
181
+ {
182
+ return curand__discrete(state, discrete_distribution);
183
+ }
184
+
185
+ /*
186
+ * \brief Return a discrete distributed unsigned int from a Sobol32 generator.
187
+ *
188
+ * Return a single discrete distributed unsigned int derived from a
189
+ * distribution defined by \p discrete_distribution from the Sobol32 generator in \p state,
190
+ * increment position of generator by one.
191
+ *
192
+ * \param state - Pointer to state to update
193
+ * \param discrete_distribution - ancillary structure for discrete distribution
194
+ *
195
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
196
+ */
197
+ QUALIFIERS unsigned int curand_discrete(curandStateSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
198
+ {
199
+ return curand__discrete(state, discrete_distribution);
200
+ }
201
+
202
+ /*
203
+ * \brief Return a discrete distributed unsigned int from a scrambled Sobol32 generator.
204
+ *
205
+ * Return a single discrete distributed unsigned int derived from a
206
+ * distribution defined by \p discrete_distribution from the scrambled Sobol32 generator in \p state,
207
+ * increment position of generator by one.
208
+ *
209
+ * \param state - Pointer to state to update
210
+ * \param discrete_distribution - ancillary structure for discrete distribution
211
+ *
212
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
213
+ */
214
+ QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol32_t *state, curandDiscreteDistribution_t discrete_distribution)
215
+ {
216
+ return curand__discrete(state, discrete_distribution);
217
+ }
218
+
219
+ /*
220
+ * \brief Return a discrete distributed unsigned int from a Sobol64 generator.
221
+ *
222
+ * Return a single discrete distributed unsigned int derived from a
223
+ * distribution defined by \p discrete_distribution from the Sobol64 generator in \p state,
224
+ * increment position of generator by one.
225
+ *
226
+ * \param state - Pointer to state to update
227
+ * \param discrete_distribution - ancillary structure for discrete distribution
228
+ *
229
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
230
+ */
231
+ QUALIFIERS unsigned int curand_discrete(curandStateSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
232
+ {
233
+ return curand__discrete(state, discrete_distribution);
234
+ }
235
+
236
+ /*
237
+ * \brief Return a discrete distributed unsigned int from a scrambled Sobol64 generator.
238
+ *
239
+ * Return a single discrete distributed unsigned int derived from a
240
+ * distribution defined by \p discrete_distribution from the scrambled Sobol64 generator in \p state,
241
+ * increment position of generator by one.
242
+ *
243
+ * \param state - Pointer to state to update
244
+ * \param discrete_distribution - ancillary structure for discrete distribution
245
+ *
246
+ * \return unsigned int distributed by distribution defined by \p discrete_distribution.
247
+ */
248
+ QUALIFIERS unsigned int curand_discrete(curandStateScrambledSobol64_t *state, curandDiscreteDistribution_t discrete_distribution)
249
+ {
250
+ return curand__discrete(state, discrete_distribution);
251
+ }
252
+
253
+ #endif // !defined(CURAND_DISCRETE_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_globals.h ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+ #ifndef CURAND_GLOBALS_H
49
+ #define CURAND_GLOBALS_H
50
+
51
+ #define MAX_XOR_N (5)
52
+ #define SKIPAHEAD_BLOCKSIZE (4)
53
+ #define SKIPAHEAD_MASK ((1<<SKIPAHEAD_BLOCKSIZE)-1)
54
+ #define CURAND_2POW32 (4294967296.f)
55
+ #define CURAND_2POW32_DOUBLE (4294967296.)
56
+ #define CURAND_2POW32_INV (2.3283064e-10f)
57
+ #define CURAND_2POW32_INV_DOUBLE (2.3283064365386963e-10)
58
+ #define CURAND_2POW53_INV_DOUBLE (1.1102230246251565e-16)
59
+ #define CURAND_2POW32_INV_2PI (2.3283064e-10f * 6.2831855f)
60
+ #define CURAND_2PI (6.2831855f)
61
+ #define CURAND_2POW53_INV_2PI_DOUBLE (1.1102230246251565e-16 * 6.2831853071795860)
62
+ #define CURAND_PI_DOUBLE (3.1415926535897932)
63
+ #define CURAND_2PI_DOUBLE (6.2831853071795860)
64
+ #define CURAND_SQRT2 (-1.4142135f)
65
+ #define CURAND_SQRT2_DOUBLE (-1.4142135623730951)
66
+
67
+ #define SOBOL64_ITR_BINARY_DIVIDE 2
68
+ #define SOBOL_M2_BINARY_DIVIDE 10
69
+ #define MTGP32_M2_BINARY_DIVIDE 32
70
+ #define MAX_LAMBDA 400000
71
+ #define MIN_GAUSS_LAMBDA 2000
72
+
73
+ struct normal_args_st {
74
+ float mean;
75
+ float stddev;
76
+ };
77
+
78
+ typedef struct normal_args_st normal_args_t;
79
+
80
+ struct normal_args_double_st {
81
+ double mean;
82
+ double stddev;
83
+ };
84
+
85
+ typedef struct normal_args_double_st normal_args_double_t;
86
+
87
+
88
+
89
+
90
+
91
+
92
+
93
+ #endif
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_kernel.h ADDED
@@ -0,0 +1,1677 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_KERNEL_H_)
52
+ #define CURAND_KERNEL_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #if !defined(QUALIFIERS)
61
+ #define QUALIFIERS static __forceinline__ __device__
62
+ #endif
63
+
64
+ /* To prevent unused parameter warnings */
65
+ #if !defined(GCC_UNUSED_PARAMETER)
66
+ #if defined(__GNUC__)
67
+ #define GCC_UNUSED_PARAMETER __attribute__((unused))
68
+ #else
69
+ #define GCC_UNUSED_PARAMETER
70
+ #endif /* defined(__GNUC__) */
71
+ #endif /* !defined(GCC_UNUSED_PARAMETER) */
72
+
73
+ #include <nv/target>
74
+
75
+ #ifdef __CUDACC_RTC__
76
+ #define CURAND_DETAIL_USE_CUDA_STL
77
+ #endif
78
+
79
+ #if __cplusplus >= 201103L
80
+ # ifdef CURAND_DETAIL_USE_CUDA_STL
81
+ # define CURAND_STD cuda::std
82
+ # include <cuda/std/type_traits>
83
+ # else
84
+ # define CURAND_STD std
85
+ # include <type_traits>
86
+ # endif // CURAND_DETAIL_USE_CUDA_STL
87
+ #else
88
+ // To support C++03 compilation
89
+ # define CURAND_STD curand_detail
90
+ namespace curand_detail {
91
+ template<bool B, class T = void>
92
+ struct enable_if {};
93
+
94
+ template<class T>
95
+ struct enable_if<true, T> { typedef T type; };
96
+
97
+ template<class T, class U>
98
+ struct is_same { static const bool value = false; };
99
+
100
+ template<class T>
101
+ struct is_same<T, T> { static const bool value = true; };
102
+ } // namespace curand_detail
103
+ #endif // __cplusplus >= 201103L
104
+
105
+ #ifndef __CUDACC_RTC__
106
+ #include <math.h>
107
+ #endif // __CUDACC_RTC__
108
+
109
+ #include "curand.h"
110
+ #include "curand_discrete.h"
111
+ #include "curand_precalc.h"
112
+ #include "curand_mrg32k3a.h"
113
+ #include "curand_mtgp32_kernel.h"
114
+ #include "curand_philox4x32_x.h"
115
+ #include "curand_globals.h"
116
+
117
+ /* Test RNG */
118
+ /* This generator uses the formula:
119
+ x_n = x_(n-1) + 1 mod 2^32
120
+ x_0 = (unsigned int)seed * 3
121
+ Subsequences are spaced 31337 steps apart.
122
+ */
123
+ struct curandStateTest {
124
+ unsigned int v;
125
+ };
126
+
127
+ /** \cond UNHIDE_TYPEDEFS */
128
+ typedef struct curandStateTest curandStateTest_t;
129
+ /** \endcond */
130
+
131
+ /* XORSHIFT FAMILY RNGs */
132
+ /* These generators are a family proposed by Marsaglia. They keep state
133
+ in 32 bit chunks, then use repeated shift and xor operations to scramble
134
+ the bits. The following generators are a combination of a simple Weyl
135
+ generator with an N variable XORSHIFT generator.
136
+ */
137
+
138
+ /* XORSHIFT RNG */
139
+ /* This generator uses the xorwow formula of
140
+ www.jstatsoft.org/v08/i14/paper page 5
141
+ Has period 2^192 - 2^32.
142
+ */
143
+ /**
144
+ * CURAND XORWOW state
145
+ */
146
+ struct curandStateXORWOW;
147
+
148
+ /*
149
+ * Implementation details not in reference documentation */
150
+ struct curandStateXORWOW {
151
+ unsigned int d, v[5];
152
+ int boxmuller_flag;
153
+ int boxmuller_flag_double;
154
+ float boxmuller_extra;
155
+ double boxmuller_extra_double;
156
+ };
157
+
158
+ /*
159
+ * CURAND XORWOW state
160
+ */
161
+ /** \cond UNHIDE_TYPEDEFS */
162
+ typedef struct curandStateXORWOW curandStateXORWOW_t;
163
+
164
+ #define EXTRA_FLAG_NORMAL 0x00000001
165
+ #define EXTRA_FLAG_LOG_NORMAL 0x00000002
166
+ /** \endcond */
167
+
168
+ /* Combined Multiple Recursive Generators */
169
+ /* These generators are a family proposed by L'Ecuyer. They keep state
170
+ in sets of doubles, then use repeated modular arithmetic multiply operations
171
+ to scramble the bits in each set, and combine the result.
172
+ */
173
+
174
+ /* MRG32k3a RNG */
175
+ /* This generator uses the MRG32k3A formula of
176
+ http://www.iro.umontreal.ca/~lecuyer/myftp/streams00/c++/streams4.pdf
177
+ Has period 2^191.
178
+ */
179
+
180
+ /* moduli for the recursions */
181
+ /** \cond UNHIDE_DEFINES */
182
+ #define MRG32K3A_MOD1 4294967087.
183
+ #define MRG32K3A_MOD2 4294944443.
184
+
185
+ /* Constants used in generation */
186
+
187
+ #define MRG32K3A_A12 1403580.
188
+ #define MRG32K3A_A13N 810728.
189
+ #define MRG32K3A_A21 527612.
190
+ #define MRG32K3A_A23N 1370589.
191
+ #define MRG32K3A_NORM (2.3283065498378288e-10)
192
+ //
193
+ // #define MRG32K3A_BITS_NORM ((double)((POW32_DOUBLE-1.0)/MOD1))
194
+ // above constant, used verbatim, rounds differently on some host systems.
195
+ #define MRG32K3A_BITS_NORM 1.000000048662
196
+
197
+ /** \endcond */
198
+
199
+
200
+
201
+
202
+ /**
203
+ * CURAND MRG32K3A state
204
+ */
205
+ struct curandStateMRG32k3a;
206
+
207
+ /* Implementation details not in reference documentation */
208
+ struct curandStateMRG32k3a {
209
+ unsigned int s1[3];
210
+ unsigned int s2[3];
211
+ int boxmuller_flag;
212
+ int boxmuller_flag_double;
213
+ float boxmuller_extra;
214
+ double boxmuller_extra_double;
215
+ };
216
+
217
+ /*
218
+ * CURAND MRG32K3A state
219
+ */
220
+ /** \cond UNHIDE_TYPEDEFS */
221
+ typedef struct curandStateMRG32k3a curandStateMRG32k3a_t;
222
+ /** \endcond */
223
+
224
+ /* SOBOL QRNG */
225
+ /**
226
+ * CURAND Sobol32 state
227
+ */
228
+ struct curandStateSobol32;
229
+
230
+ /* Implementation details not in reference documentation */
231
+ struct curandStateSobol32 {
232
+ unsigned int i, x, c;
233
+ unsigned int direction_vectors[32];
234
+ };
235
+
236
+ /*
237
+ * CURAND Sobol32 state
238
+ */
239
+ /** \cond UNHIDE_TYPEDEFS */
240
+ typedef struct curandStateSobol32 curandStateSobol32_t;
241
+ /** \endcond */
242
+
243
+ /**
244
+ * CURAND Scrambled Sobol32 state
245
+ */
246
+ struct curandStateScrambledSobol32;
247
+
248
+ /* Implementation details not in reference documentation */
249
+ struct curandStateScrambledSobol32 {
250
+ unsigned int i, x, c;
251
+ unsigned int direction_vectors[32];
252
+ };
253
+
254
+ /*
255
+ * CURAND Scrambled Sobol32 state
256
+ */
257
+ /** \cond UNHIDE_TYPEDEFS */
258
+ typedef struct curandStateScrambledSobol32 curandStateScrambledSobol32_t;
259
+ /** \endcond */
260
+
261
+ /**
262
+ * CURAND Sobol64 state
263
+ */
264
+ struct curandStateSobol64;
265
+
266
+ /* Implementation details not in reference documentation */
267
+ struct curandStateSobol64 {
268
+ unsigned long long i, x, c;
269
+ unsigned long long direction_vectors[64];
270
+ };
271
+
272
+ /*
273
+ * CURAND Sobol64 state
274
+ */
275
+ /** \cond UNHIDE_TYPEDEFS */
276
+ typedef struct curandStateSobol64 curandStateSobol64_t;
277
+ /** \endcond */
278
+
279
+ /**
280
+ * CURAND Scrambled Sobol64 state
281
+ */
282
+ struct curandStateScrambledSobol64;
283
+
284
+ /* Implementation details not in reference documentation */
285
+ struct curandStateScrambledSobol64 {
286
+ unsigned long long i, x, c;
287
+ unsigned long long direction_vectors[64];
288
+ };
289
+
290
+ /*
291
+ * CURAND Scrambled Sobol64 state
292
+ */
293
+ /** \cond UNHIDE_TYPEDEFS */
294
+ typedef struct curandStateScrambledSobol64 curandStateScrambledSobol64_t;
295
+ /** \endcond */
296
+
297
+ /*
298
+ * Default RNG
299
+ */
300
+ /** \cond UNHIDE_TYPEDEFS */
301
+ typedef struct curandStateXORWOW curandState_t;
302
+ typedef struct curandStateXORWOW curandState;
303
+ /** \endcond */
304
+
305
+ /****************************************************************************/
306
+ /* Utility functions needed by RNGs */
307
+ /****************************************************************************/
308
+ /** \cond UNHIDE_UTILITIES */
309
+ /*
310
+ multiply vector by matrix, store in result
311
+ matrix is n x n, measured in 32 bit units
312
+ matrix is stored in row major order
313
+ vector and result cannot be same pointer
314
+ */
315
+ template<int N>
316
+ QUALIFIERS void __curand_matvec_inplace(unsigned int *vector, unsigned int *matrix)
317
+ {
318
+ unsigned int result[N] = { 0 };
319
+ for(int i = 0; i < N; i++) {
320
+ #ifdef __CUDA_ARCH__
321
+ #pragma unroll 16
322
+ #endif
323
+ for(int j = 0; j < 32; j++) {
324
+ if(vector[i] & (1 << j)) {
325
+ for(int k = 0; k < N; k++) {
326
+ result[k] ^= matrix[N * (i * 32 + j) + k];
327
+ }
328
+ }
329
+ }
330
+ }
331
+ for(int i = 0; i < N; i++) {
332
+ vector[i] = result[i];
333
+ }
334
+ }
335
+
336
+ QUALIFIERS void __curand_matvec(unsigned int *vector, unsigned int *matrix,
337
+ unsigned int *result, int n)
338
+ {
339
+ for(int i = 0; i < n; i++) {
340
+ result[i] = 0;
341
+ }
342
+ for(int i = 0; i < n; i++) {
343
+ for(int j = 0; j < 32; j++) {
344
+ if(vector[i] & (1 << j)) {
345
+ for(int k = 0; k < n; k++) {
346
+ result[k] ^= matrix[n * (i * 32 + j) + k];
347
+ }
348
+ }
349
+ }
350
+ }
351
+ }
352
+
353
+ /* generate identity matrix */
354
+ QUALIFIERS void __curand_matidentity(unsigned int *matrix, int n)
355
+ {
356
+ int r;
357
+ for(int i = 0; i < n * 32; i++) {
358
+ for(int j = 0; j < n; j++) {
359
+ r = i & 31;
360
+ if(i / 32 == j) {
361
+ matrix[i * n + j] = (1 << r);
362
+ } else {
363
+ matrix[i * n + j] = 0;
364
+ }
365
+ }
366
+ }
367
+ }
368
+
369
+ /* multiply matrixA by matrixB, store back in matrixA
370
+ matrixA and matrixB must not be same matrix */
371
+ QUALIFIERS void __curand_matmat(unsigned int *matrixA, unsigned int *matrixB, int n)
372
+ {
373
+ unsigned int result[MAX_XOR_N];
374
+ for(int i = 0; i < n * 32; i++) {
375
+ __curand_matvec(matrixA + i * n, matrixB, result, n);
376
+ for(int j = 0; j < n; j++) {
377
+ matrixA[i * n + j] = result[j];
378
+ }
379
+ }
380
+ }
381
+
382
+ /* copy vectorA to vector */
383
+ QUALIFIERS void __curand_veccopy(unsigned int *vector, unsigned int *vectorA, int n)
384
+ {
385
+ for(int i = 0; i < n; i++) {
386
+ vector[i] = vectorA[i];
387
+ }
388
+ }
389
+
390
+ /* copy matrixA to matrix */
391
+ QUALIFIERS void __curand_matcopy(unsigned int *matrix, unsigned int *matrixA, int n)
392
+ {
393
+ for(int i = 0; i < n * n * 32; i++) {
394
+ matrix[i] = matrixA[i];
395
+ }
396
+ }
397
+
398
+ /* compute matrixA to power p, store result in matrix */
399
+ QUALIFIERS void __curand_matpow(unsigned int *matrix, unsigned int *matrixA,
400
+ unsigned long long p, int n)
401
+ {
402
+ unsigned int matrixR[MAX_XOR_N * MAX_XOR_N * 32];
403
+ unsigned int matrixS[MAX_XOR_N * MAX_XOR_N * 32];
404
+ __curand_matidentity(matrix, n);
405
+ __curand_matcopy(matrixR, matrixA, n);
406
+ while(p) {
407
+ if(p & 1) {
408
+ __curand_matmat(matrix, matrixR, n);
409
+ }
410
+ __curand_matcopy(matrixS, matrixR, n);
411
+ __curand_matmat(matrixR, matrixS, n);
412
+ p >>= 1;
413
+ }
414
+ }
415
+
416
+ /****************************************************************************/
417
+ /* Utility functions needed by MRG32k3a RNG */
418
+ /* Matrix operations modulo some integer less than 2**32, done in */
419
+ /* double precision floating point, with care not to overflow 53 bits */
420
+ /****************************************************************************/
421
+
422
+ /* return i mod m. */
423
+ /* assumes i and m are integers represented accurately in doubles */
424
+
425
+ QUALIFIERS double curand_MRGmod(double i, double m)
426
+ {
427
+ double quo;
428
+ double rem;
429
+ quo = floor(i/m);
430
+ rem = i - (quo*m);
431
+ if (rem < 0.0) rem += m;
432
+ return rem;
433
+ }
434
+
435
+ /* Multiplication modulo m. Inputs i and j less than 2**32 */
436
+ /* Ensure intermediate results do not exceed 2**53 */
437
+
438
+ QUALIFIERS double curand_MRGmodMul(double i, double j, double m)
439
+ {
440
+ double tempHi;
441
+ double tempLo;
442
+
443
+ tempHi = floor(i/131072.0);
444
+ tempLo = i - (tempHi*131072.0);
445
+ tempLo = curand_MRGmod( curand_MRGmod( (tempHi * j), m) * 131072.0 + curand_MRGmod(tempLo * j, m),m);
446
+
447
+ if (tempLo < 0.0) tempLo += m;
448
+ return tempLo;
449
+ }
450
+
451
+ /* multiply 3 by 3 matrices of doubles, modulo m */
452
+
453
+ QUALIFIERS void curand_MRGmatMul3x3(unsigned int i1[][3],unsigned int i2[][3],unsigned int o[][3],double m)
454
+ {
455
+ int i,j;
456
+ double temp[3][3];
457
+ for (i=0; i<3; i++){
458
+ for (j=0; j<3; j++){
459
+ temp[i][j] = ( curand_MRGmodMul(i1[i][0], i2[0][j], m) +
460
+ curand_MRGmodMul(i1[i][1], i2[1][j], m) +
461
+ curand_MRGmodMul(i1[i][2], i2[2][j], m));
462
+ temp[i][j] = curand_MRGmod( temp[i][j], m );
463
+ }
464
+ }
465
+ for (i=0; i<3; i++){
466
+ for (j=0; j<3; j++){
467
+ o[i][j] = (unsigned int)temp[i][j];
468
+ }
469
+ }
470
+ }
471
+
472
+ /* multiply 3 by 3 matrix times 3 by 1 vector of doubles, modulo m */
473
+
474
+ QUALIFIERS void curand_MRGmatVecMul3x3( unsigned int i[][3], unsigned int v[], double m)
475
+ {
476
+ int k;
477
+ double t[3];
478
+ for (k = 0; k < 3; k++) {
479
+ t[k] = ( curand_MRGmodMul(i[k][0], v[0], m) +
480
+ curand_MRGmodMul(i[k][1], v[1], m) +
481
+ curand_MRGmodMul(i[k][2], v[2], m) );
482
+ t[k] = curand_MRGmod( t[k], m );
483
+ }
484
+ for (k = 0; k < 3; k++) {
485
+ v[k] = (unsigned int)t[k];
486
+ }
487
+
488
+ }
489
+
490
+ /* raise a 3 by 3 matrix of doubles to a 64 bit integer power pow, modulo m */
491
+ /* input is index zero of an array of 3 by 3 matrices m, */
492
+ /* each m = m[0]**(2**index) */
493
+
494
+ QUALIFIERS void curand_MRGmatPow3x3( unsigned int in[][3][3], unsigned int o[][3], double m, unsigned long long pow )
495
+ {
496
+ int i,j;
497
+ for ( i = 0; i < 3; i++ ) {
498
+ for ( j = 0; j < 3; j++ ) {
499
+ o[i][j] = 0;
500
+ if ( i == j ) o[i][j] = 1;
501
+ }
502
+ }
503
+ i = 0;
504
+ curand_MRGmatVecMul3x3(o,o[0],m);
505
+ while (pow) {
506
+ if ( pow & 1ll ) {
507
+ curand_MRGmatMul3x3(in[i], o, o, m);
508
+ }
509
+ i++;
510
+ pow >>= 1;
511
+ }
512
+ }
513
+
514
+ /* raise a 3 by 3 matrix of doubles to the power */
515
+ /* 2 to the power (pow modulo 191), modulo m */
516
+
517
+ QUALIFIERS void curnand_MRGmatPow2Pow3x3( double in[][3], double o[][3], double m, unsigned long pow )
518
+ {
519
+ unsigned int temp[3][3];
520
+ int i,j;
521
+ pow = pow % 191;
522
+ for ( i = 0; i < 3; i++ ) {
523
+ for ( j = 0; j < 3; j++ ) {
524
+ temp[i][j] = (unsigned int)in[i][j];
525
+ }
526
+ }
527
+ while (pow) {
528
+ curand_MRGmatMul3x3(temp, temp, temp, m);
529
+ pow--;
530
+ }
531
+ for ( i = 0; i < 3; i++ ) {
532
+ for ( j = 0; j < 3; j++ ) {
533
+ o[i][j] = temp[i][j];
534
+ }
535
+ }
536
+ }
537
+
538
+ /** \endcond */
539
+
540
+ /****************************************************************************/
541
+ /* Kernel implementations of RNGs */
542
+ /****************************************************************************/
543
+
544
+ /* Test RNG */
545
+
546
+ QUALIFIERS void curand_init(unsigned long long seed,
547
+ unsigned long long subsequence,
548
+ unsigned long long offset,
549
+ curandStateTest_t *state)
550
+ {
551
+ state->v = (unsigned int)(seed * 3) + (unsigned int)(subsequence * 31337) + \
552
+ (unsigned int)offset;
553
+ }
554
+
555
+
556
+ QUALIFIERS unsigned int curand(curandStateTest_t *state)
557
+ {
558
+ unsigned int r = state->v++;
559
+ return r;
560
+ }
561
+
562
+ QUALIFIERS void skipahead(unsigned long long n, curandStateTest_t *state)
563
+ {
564
+ state->v += (unsigned int)n;
565
+ }
566
+
567
+ /* XORWOW RNG */
568
+
569
+ template <typename T, int n>
570
+ QUALIFIERS void __curand_generate_skipahead_matrix_xor(unsigned int matrix[])
571
+ {
572
+ T state;
573
+ // Generate matrix that advances one step
574
+ // matrix has n * n * 32 32-bit elements
575
+ // solve for matrix by stepping single bit states
576
+ for(int i = 0; i < 32 * n; i++) {
577
+ state.d = 0;
578
+ for(int j = 0; j < n; j++) {
579
+ state.v[j] = 0;
580
+ }
581
+ state.v[i / 32] = (1 << (i & 31));
582
+ curand(&state);
583
+ for(int j = 0; j < n; j++) {
584
+ matrix[i * n + j] = state.v[j];
585
+ }
586
+ }
587
+ }
588
+
589
+ template <typename T, int n>
590
+ QUALIFIERS void _skipahead_scratch(unsigned long long x, T *state, unsigned int *scratch)
591
+ {
592
+ // unsigned int matrix[n * n * 32];
593
+ unsigned int *matrix = scratch;
594
+ // unsigned int matrixA[n * n * 32];
595
+ unsigned int *matrixA = scratch + (n * n * 32);
596
+ // unsigned int vector[n];
597
+ unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
598
+ // unsigned int result[n];
599
+ unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
600
+ unsigned long long p = x;
601
+ for(int i = 0; i < n; i++) {
602
+ vector[i] = state->v[i];
603
+ }
604
+ int matrix_num = 0;
605
+ while(p && (matrix_num < PRECALC_NUM_MATRICES - 1)) {
606
+ for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
607
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
608
+ __curand_matvec(vector, precalc_xorwow_offset_matrix[matrix_num], result, n);
609
+ ,
610
+ __curand_matvec(vector, precalc_xorwow_offset_matrix_host[matrix_num], result, n);
611
+ )
612
+ __curand_veccopy(vector, result, n);
613
+ }
614
+ p >>= PRECALC_BLOCK_SIZE;
615
+ matrix_num++;
616
+ }
617
+ if(p) {
618
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
619
+ __curand_matcopy(matrix, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
620
+ __curand_matcopy(matrixA, precalc_xorwow_offset_matrix[PRECALC_NUM_MATRICES - 1], n);
621
+ ,
622
+ __curand_matcopy(matrix, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
623
+ __curand_matcopy(matrixA, precalc_xorwow_offset_matrix_host[PRECALC_NUM_MATRICES - 1], n);
624
+ )
625
+ }
626
+ while(p) {
627
+ for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
628
+ __curand_matvec(vector, matrixA, result, n);
629
+ __curand_veccopy(vector, result, n);
630
+ }
631
+ p >>= SKIPAHEAD_BLOCKSIZE;
632
+ if(p) {
633
+ for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
634
+ __curand_matmat(matrix, matrixA, n);
635
+ __curand_matcopy(matrixA, matrix, n);
636
+ }
637
+ }
638
+ }
639
+ for(int i = 0; i < n; i++) {
640
+ state->v[i] = vector[i];
641
+ }
642
+ state->d += 362437 * (unsigned int)x;
643
+ }
644
+
645
+ template <typename T, int n>
646
+ QUALIFIERS void _skipahead_sequence_scratch(unsigned long long x, T *state, unsigned int *scratch)
647
+ {
648
+ // unsigned int matrix[n * n * 32];
649
+ unsigned int *matrix = scratch;
650
+ // unsigned int matrixA[n * n * 32];
651
+ unsigned int *matrixA = scratch + (n * n * 32);
652
+ // unsigned int vector[n];
653
+ unsigned int *vector = scratch + (n * n * 32) + (n * n * 32);
654
+ // unsigned int result[n];
655
+ unsigned int *result = scratch + (n * n * 32) + (n * n * 32) + n;
656
+ unsigned long long p = x;
657
+ for(int i = 0; i < n; i++) {
658
+ vector[i] = state->v[i];
659
+ }
660
+ int matrix_num = 0;
661
+ while(p && matrix_num < PRECALC_NUM_MATRICES - 1) {
662
+ for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
663
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
664
+ __curand_matvec(vector, precalc_xorwow_matrix[matrix_num], result, n);
665
+ ,
666
+ __curand_matvec(vector, precalc_xorwow_matrix_host[matrix_num], result, n);
667
+ )
668
+ __curand_veccopy(vector, result, n);
669
+ }
670
+ p >>= PRECALC_BLOCK_SIZE;
671
+ matrix_num++;
672
+ }
673
+ if(p) {
674
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
675
+ __curand_matcopy(matrix, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
676
+ __curand_matcopy(matrixA, precalc_xorwow_matrix[PRECALC_NUM_MATRICES - 1], n);
677
+ ,
678
+ __curand_matcopy(matrix, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
679
+ __curand_matcopy(matrixA, precalc_xorwow_matrix_host[PRECALC_NUM_MATRICES - 1], n);
680
+ )
681
+ }
682
+ while(p) {
683
+ for(unsigned int t = 0; t < (p & SKIPAHEAD_MASK); t++) {
684
+ __curand_matvec(vector, matrixA, result, n);
685
+ __curand_veccopy(vector, result, n);
686
+ }
687
+ p >>= SKIPAHEAD_BLOCKSIZE;
688
+ if(p) {
689
+ for(int i = 0; i < SKIPAHEAD_BLOCKSIZE; i++) {
690
+ __curand_matmat(matrix, matrixA, n);
691
+ __curand_matcopy(matrixA, matrix, n);
692
+ }
693
+ }
694
+ }
695
+ for(int i = 0; i < n; i++) {
696
+ state->v[i] = vector[i];
697
+ }
698
+ /* No update of state->d needed, guaranteed to be a multiple of 2^32 */
699
+ }
700
+
701
+ template <typename T, int N>
702
+ QUALIFIERS void _skipahead_inplace(const unsigned long long x, T *state)
703
+ {
704
+ unsigned long long p = x;
705
+ int matrix_num = 0;
706
+ while(p) {
707
+ for(unsigned int t = 0; t < (p & PRECALC_BLOCK_MASK); t++) {
708
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
709
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_offset_matrix[matrix_num]);
710
+ ,
711
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_offset_matrix_host[matrix_num]);
712
+ )
713
+ }
714
+ p >>= PRECALC_BLOCK_SIZE;
715
+ matrix_num++;
716
+ }
717
+ state->d += 362437 * (unsigned int)x;
718
+ }
719
+
720
+ template <typename T, int N>
721
+ QUALIFIERS void _skipahead_sequence_inplace(unsigned long long x, T *state)
722
+ {
723
+ int matrix_num = 0;
724
+ while(x) {
725
+ for(unsigned int t = 0; t < (x & PRECALC_BLOCK_MASK); t++) {
726
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
727
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_matrix[matrix_num]);
728
+ ,
729
+ __curand_matvec_inplace<N>(state->v, precalc_xorwow_matrix_host[matrix_num]);
730
+ )
731
+ }
732
+ x >>= PRECALC_BLOCK_SIZE;
733
+ matrix_num++;
734
+ }
735
+ /* No update of state->d needed, guaranteed to be a multiple of 2^32 */
736
+ }
737
+
738
+ /**
739
+ * \brief Update XORWOW state to skip \p n elements.
740
+ *
741
+ * Update the XORWOW state in \p state to skip ahead \p n elements.
742
+ *
743
+ * All values of \p n are valid. Large values require more computation and so
744
+ * will take more time to complete.
745
+ *
746
+ * \param n - Number of elements to skip
747
+ * \param state - Pointer to state to update
748
+ */
749
+ QUALIFIERS void skipahead(unsigned long long n, curandStateXORWOW_t *state)
750
+ {
751
+ _skipahead_inplace<curandStateXORWOW_t, 5>(n, state);
752
+ }
753
+
754
+ /**
755
+ * \brief Update XORWOW state to skip ahead \p n subsequences.
756
+ *
757
+ * Update the XORWOW state in \p state to skip ahead \p n subsequences. Each
758
+ * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
759
+ * \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly * n elements.
760
+ *
761
+ * All values of \p n are valid. Large values require more computation and so
762
+ * will take more time to complete.
763
+ *
764
+ * \param n - Number of subsequences to skip
765
+ * \param state - Pointer to state to update
766
+ */
767
+ QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateXORWOW_t *state)
768
+ {
769
+ _skipahead_sequence_inplace<curandStateXORWOW_t, 5>(n, state);
770
+ }
771
+
772
+ QUALIFIERS void _curand_init_scratch(unsigned long long seed,
773
+ unsigned long long subsequence,
774
+ unsigned long long offset,
775
+ curandStateXORWOW_t *state,
776
+ unsigned int *scratch)
777
+ {
778
+ // Break up seed, apply salt
779
+ // Constants are arbitrary nonzero values
780
+ unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
781
+ unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
782
+ // Simple multiplication to mix up bits
783
+ // Constants are arbitrary odd values
784
+ unsigned int t0 = 1099087573UL * s0;
785
+ unsigned int t1 = 2591861531UL * s1;
786
+ state->d = 6615241 + t1 + t0;
787
+ state->v[0] = 123456789UL + t0;
788
+ state->v[1] = 362436069UL ^ t0;
789
+ state->v[2] = 521288629UL + t1;
790
+ state->v[3] = 88675123UL ^ t1;
791
+ state->v[4] = 5783321UL + t0;
792
+ _skipahead_sequence_scratch<curandStateXORWOW_t, 5>(subsequence, state, scratch);
793
+ _skipahead_scratch<curandStateXORWOW_t, 5>(offset, state, scratch);
794
+ state->boxmuller_flag = 0;
795
+ state->boxmuller_flag_double = 0;
796
+ state->boxmuller_extra = 0.f;
797
+ state->boxmuller_extra_double = 0.;
798
+ }
799
+
800
+ QUALIFIERS void _curand_init_inplace(unsigned long long seed,
801
+ unsigned long long subsequence,
802
+ unsigned long long offset,
803
+ curandStateXORWOW_t *state)
804
+ {
805
+ // Break up seed, apply salt
806
+ // Constants are arbitrary nonzero values
807
+ unsigned int s0 = ((unsigned int)seed) ^ 0xaad26b49UL;
808
+ unsigned int s1 = (unsigned int)(seed >> 32) ^ 0xf7dcefddUL;
809
+ // Simple multiplication to mix up bits
810
+ // Constants are arbitrary odd values
811
+ unsigned int t0 = 1099087573UL * s0;
812
+ unsigned int t1 = 2591861531UL * s1;
813
+ state->d = 6615241 + t1 + t0;
814
+ state->v[0] = 123456789UL + t0;
815
+ state->v[1] = 362436069UL ^ t0;
816
+ state->v[2] = 521288629UL + t1;
817
+ state->v[3] = 88675123UL ^ t1;
818
+ state->v[4] = 5783321UL + t0;
819
+ _skipahead_sequence_inplace<curandStateXORWOW_t, 5>(subsequence, state);
820
+ _skipahead_inplace<curandStateXORWOW_t, 5>(offset, state);
821
+ state->boxmuller_flag = 0;
822
+ state->boxmuller_flag_double = 0;
823
+ state->boxmuller_extra = 0.f;
824
+ state->boxmuller_extra_double = 0.;
825
+ }
826
+
827
+ /**
828
+ * \brief Initialize XORWOW state.
829
+ *
830
+ * Initialize XORWOW state in \p state with the given \p seed, \p subsequence,
831
+ * and \p offset.
832
+ *
833
+ * All input values of \p seed, \p subsequence, and \p offset are legal. Large
834
+ * values for \p subsequence and \p offset require more computation and so will
835
+ * take more time to complete.
836
+ *
837
+ * A value of 0 for \p seed sets the state to the values of the original
838
+ * published version of the \p xorwow algorithm.
839
+ *
840
+ * \param seed - Arbitrary bits to use as a seed
841
+ * \param subsequence - Subsequence to start at
842
+ * \param offset - Absolute offset into sequence
843
+ * \param state - Pointer to state to initialize
844
+ */
845
+ QUALIFIERS void curand_init(unsigned long long seed,
846
+ unsigned long long subsequence,
847
+ unsigned long long offset,
848
+ curandStateXORWOW_t *state)
849
+ {
850
+ _curand_init_inplace(seed, subsequence, offset, state);
851
+ }
852
+
853
+ /**
854
+ * \brief Return 32-bits of pseudorandomness from an XORWOW generator.
855
+ *
856
+ * Return 32-bits of pseudorandomness from the XORWOW generator in \p state,
857
+ * increment position of generator by one.
858
+ *
859
+ * \param state - Pointer to state to update
860
+ *
861
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
862
+ */
863
+ QUALIFIERS unsigned int curand(curandStateXORWOW_t *state)
864
+ {
865
+ unsigned int t;
866
+ t = (state->v[0] ^ (state->v[0] >> 2));
867
+ state->v[0] = state->v[1];
868
+ state->v[1] = state->v[2];
869
+ state->v[2] = state->v[3];
870
+ state->v[3] = state->v[4];
871
+ state->v[4] = (state->v[4] ^ (state->v[4] <<4)) ^ (t ^ (t << 1));
872
+ state->d += 362437;
873
+ return state->v[4] + state->d;
874
+ }
875
+
876
+
877
+ /**
878
+ * \brief Return 32-bits of pseudorandomness from an Philox4_32_10 generator.
879
+ *
880
+ * Return 32-bits of pseudorandomness from the Philox4_32_10 generator in \p state,
881
+ * increment position of generator by one.
882
+ *
883
+ * \param state - Pointer to state to update
884
+ *
885
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
886
+ */
887
+
888
+ QUALIFIERS unsigned int curand(curandStatePhilox4_32_10_t *state)
889
+ {
890
+ // Maintain the invariant: output[STATE] is always "good" and
891
+ // is the next value to be returned by curand.
892
+ unsigned int ret;
893
+ switch(state->STATE++){
894
+ default:
895
+ ret = state->output.x;
896
+ break;
897
+ case 1:
898
+ ret = state->output.y;
899
+ break;
900
+ case 2:
901
+ ret = state->output.z;
902
+ break;
903
+ case 3:
904
+ ret = state->output.w;
905
+ break;
906
+ }
907
+ if(state->STATE == 4){
908
+ Philox_State_Incr(state);
909
+ state->output = curand_Philox4x32_10(state->ctr,state->key);
910
+ state->STATE = 0;
911
+ }
912
+ return ret;
913
+ }
914
+
915
+ /**
916
+ * \brief Return tuple of 4 32-bit pseudorandoms from a Philox4_32_10 generator.
917
+ *
918
+ * Return 128 bits of pseudorandomness from the Philox4_32_10 generator in \p state,
919
+ * increment position of generator by four.
920
+ *
921
+ * \param state - Pointer to state to update
922
+ *
923
+ * \return 128-bits of pseudorandomness as a uint4, all bits valid to use.
924
+ */
925
+
926
+ QUALIFIERS uint4 curand4(curandStatePhilox4_32_10_t *state)
927
+ {
928
+ uint4 r;
929
+
930
+ uint4 tmp = state->output;
931
+ Philox_State_Incr(state);
932
+ state->output= curand_Philox4x32_10(state->ctr,state->key);
933
+ switch(state->STATE){
934
+ case 0:
935
+ return tmp;
936
+ case 1:
937
+ r.x = tmp.y;
938
+ r.y = tmp.z;
939
+ r.z = tmp.w;
940
+ r.w = state->output.x;
941
+ break;
942
+ case 2:
943
+ r.x = tmp.z;
944
+ r.y = tmp.w;
945
+ r.z = state->output.x;
946
+ r.w = state->output.y;
947
+ break;
948
+ case 3:
949
+ r.x = tmp.w;
950
+ r.y = state->output.x;
951
+ r.z = state->output.y;
952
+ r.w = state->output.z;
953
+ break;
954
+ default:
955
+ // NOT possible but needed to avoid compiler warnings
956
+ return tmp;
957
+ }
958
+ return r;
959
+ }
960
+
961
+ /**
962
+ * \brief Update Philox4_32_10 state to skip \p n elements.
963
+ *
964
+ * Update the Philox4_32_10 state in \p state to skip ahead \p n elements.
965
+ *
966
+ * All values of \p n are valid.
967
+ *
968
+ * \param n - Number of elements to skip
969
+ * \param state - Pointer to state to update
970
+ */
971
+ QUALIFIERS void skipahead(unsigned long long n, curandStatePhilox4_32_10_t *state)
972
+ {
973
+ state->STATE += (n & 3);
974
+ n /= 4;
975
+ if( state->STATE > 3 ){
976
+ n += 1;
977
+ state->STATE -= 4;
978
+ }
979
+ Philox_State_Incr(state, n);
980
+ state->output = curand_Philox4x32_10(state->ctr,state->key);
981
+ }
982
+
983
+ /**
984
+ * \brief Update Philox4_32_10 state to skip ahead \p n subsequences.
985
+ *
986
+ * Update the Philox4_32_10 state in \p state to skip ahead \p n subsequences. Each
987
+ * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
988
+ * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly * n elements.
989
+ *
990
+ * All values of \p n are valid.
991
+ *
992
+ * \param n - Number of subsequences to skip
993
+ * \param state - Pointer to state to update
994
+ */
995
+ QUALIFIERS void skipahead_sequence(unsigned long long n, curandStatePhilox4_32_10_t *state)
996
+ {
997
+ Philox_State_Incr_hi(state, n);
998
+ state->output = curand_Philox4x32_10(state->ctr,state->key);
999
+ }
1000
+
1001
+ /**
1002
+ * \brief Initialize Philox4_32_10 state.
1003
+ *
1004
+ * Initialize Philox4_32_10 state in \p state with the given \p seed, p\ subsequence,
1005
+ * and \p offset.
1006
+ *
1007
+ * All input values for \p seed, \p subseqence and \p offset are legal. Each of the
1008
+ * \xmlonly<ph outputclass="xmlonly">2<sup>64</sup></ph>\endxmlonly possible
1009
+ * values of seed selects an independent sequence of length
1010
+ * \xmlonly<ph outputclass="xmlonly">2<sup>130</sup></ph>\endxmlonly.
1011
+ * The first
1012
+ * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup> * subsequence + offset</ph>\endxmlonly.
1013
+ * values of the sequence are skipped.
1014
+ * I.e., subsequences are of length
1015
+ * \xmlonly<ph outputclass="xmlonly">2<sup>66</sup></ph>\endxmlonly.
1016
+ *
1017
+ * \param seed - Arbitrary bits to use as a seed
1018
+ * \param subsequence - Subsequence to start at
1019
+ * \param offset - Absolute offset into subsequence
1020
+ * \param state - Pointer to state to initialize
1021
+ */
1022
+ QUALIFIERS void curand_init(unsigned long long seed,
1023
+ unsigned long long subsequence,
1024
+ unsigned long long offset,
1025
+ curandStatePhilox4_32_10_t *state)
1026
+ {
1027
+ state->ctr = make_uint4(0, 0, 0, 0);
1028
+ state->key.x = (unsigned int)seed;
1029
+ state->key.y = (unsigned int)(seed>>32);
1030
+ state->STATE = 0;
1031
+ state->boxmuller_flag = 0;
1032
+ state->boxmuller_flag_double = 0;
1033
+ state->boxmuller_extra = 0.f;
1034
+ state->boxmuller_extra_double = 0.;
1035
+ skipahead_sequence(subsequence, state);
1036
+ skipahead(offset, state);
1037
+ }
1038
+
1039
+
1040
+ /* MRG32k3a RNG */
1041
+
1042
+ /* Base generator for MRG32k3a */
1043
+ QUALIFIERS unsigned long long __curand_umad(GCC_UNUSED_PARAMETER unsigned int a, GCC_UNUSED_PARAMETER unsigned int b, GCC_UNUSED_PARAMETER unsigned long long c)
1044
+ {
1045
+ unsigned long long r = 0;
1046
+ NV_IF_TARGET(NV_PROVIDES_SM_61,
1047
+ asm("mad.wide.u32 %0, %1, %2, %3;"
1048
+ : "=l"(r) : "r"(a), "r"(b), "l"(c));
1049
+ )
1050
+ return r;
1051
+ }
1052
+ QUALIFIERS unsigned long long __curand_umul(GCC_UNUSED_PARAMETER unsigned int a, GCC_UNUSED_PARAMETER unsigned int b)
1053
+ {
1054
+ unsigned long long r = 0;
1055
+ NV_IF_TARGET(NV_PROVIDES_SM_61,
1056
+ asm("mul.wide.u32 %0, %1, %2;"
1057
+ : "=l"(r) : "r"(a), "r"(b));
1058
+ )
1059
+ return r;
1060
+ }
1061
+ QUALIFIERS double curand_MRG32k3a (curandStateMRG32k3a_t *state)
1062
+ {
1063
+ NV_IF_TARGET(NV_PROVIDES_SM_61,
1064
+ const unsigned int m1 = 4294967087u;
1065
+ const unsigned int m2 = 4294944443u;
1066
+ const unsigned int m1c = 209u;
1067
+ const unsigned int m2c = 22853u;
1068
+ const unsigned int a12 = 1403580u;
1069
+ const unsigned int a13n = 810728u;
1070
+ const unsigned int a21 = 527612u;
1071
+ const unsigned int a23n = 1370589u;
1072
+
1073
+ unsigned long long p1;
1074
+ unsigned long long p2;
1075
+ const unsigned long long p3 = __curand_umul(a13n, m1 - state->s1[0]);
1076
+ p1 = __curand_umad(a12, state->s1[1], p3);
1077
+
1078
+ // Putting addition inside and changing umul to umad
1079
+ // slowed this function down on GV100
1080
+ p1 = __curand_umul(p1 >> 32, m1c) + (p1 & 0xffffffff);
1081
+ if (p1 >= m1) p1 -= m1;
1082
+
1083
+ state->s1[0] = state->s1[1]; state->s1[1] = state->s1[2]; state->s1[2] = p1;
1084
+ const unsigned long long p4 = __curand_umul(a23n, m2 - state->s2[0]);
1085
+ p2 = __curand_umad(a21, state->s2[2], p4);
1086
+
1087
+ // Putting addition inside and changing umul to umad
1088
+ // slowed this function down on GV100
1089
+ p2 = __curand_umul(p2 >> 32, m2c) + (p2 & 0xffffffff);
1090
+ p2 = __curand_umul(p2 >> 32, m2c) + (p2 & 0xffffffff);
1091
+ if (p2 >= m2) p2 -= m2;
1092
+
1093
+ state->s2[0] = state->s2[1]; state->s2[1] = state->s2[2]; state->s2[2] = p2;
1094
+
1095
+ const unsigned int p5 = (unsigned int)p1 - (unsigned int)p2;
1096
+ if(p1 <= p2) return p5 + m1;
1097
+ return p5;
1098
+ )
1099
+ NV_IF_TARGET(NV_IS_DEVICE,
1100
+ /* nj's implementation */
1101
+ const double m1 = 4294967087.;
1102
+ const double m2 = 4294944443.;
1103
+ const double a12 = 1403580.;
1104
+ const double a13n = 810728.;
1105
+ const double a21 = 527612.;
1106
+ const double a23n = 1370589.;
1107
+
1108
+ const double rh1 = 2.3283065498378290e-010; /* (1.0 / m1)__hi */
1109
+ const double rl1 = -1.7354913086174288e-026; /* (1.0 / m1)__lo */
1110
+ const double rh2 = 2.3283188252407387e-010; /* (1.0 / m2)__hi */
1111
+ const double rl2 = 2.4081018096503646e-026; /* (1.0 / m2)__lo */
1112
+
1113
+ double q;
1114
+ double p1;
1115
+ double p2;
1116
+ p1 = a12 * state->s1[1] - a13n * state->s1[0];
1117
+ q = trunc (fma (p1, rh1, p1 * rl1));
1118
+ p1 -= q * m1;
1119
+ if (p1 < 0.0) p1 += m1;
1120
+ state->s1[0] = state->s1[1]; state->s1[1] = state->s1[2]; state->s1[2] = (unsigned int)p1;
1121
+ p2 = a21 * state->s2[2] - a23n * state->s2[0];
1122
+ q = trunc (fma (p2, rh2, p2 * rl2));
1123
+ p2 -= q * m2;
1124
+ if (p2 < 0.0) p2 += m2;
1125
+ state->s2[0] = state->s2[1]; state->s2[1] = state->s2[2]; state->s2[2] = (unsigned int)p2;
1126
+ if (p1 <= p2) return (p1 - p2 + m1);
1127
+ else return (p1 - p2);
1128
+ )
1129
+ /* end nj's implementation */
1130
+ double p1;
1131
+ double p2;
1132
+ double r;
1133
+ p1 = (MRG32K3A_A12 * state->s1[1]) - (MRG32K3A_A13N * state->s1[0]);
1134
+ p1 = curand_MRGmod(p1, MRG32K3A_MOD1);
1135
+ if (p1 < 0.0) p1 += MRG32K3A_MOD1;
1136
+ state->s1[0] = state->s1[1];
1137
+ state->s1[1] = state->s1[2];
1138
+ state->s1[2] = (unsigned int)p1;
1139
+ p2 = (MRG32K3A_A21 * state->s2[2]) - (MRG32K3A_A23N * state->s2[0]);
1140
+ p2 = curand_MRGmod(p2, MRG32K3A_MOD2);
1141
+ if (p2 < 0) p2 += MRG32K3A_MOD2;
1142
+ state->s2[0] = state->s2[1];
1143
+ state->s2[1] = state->s2[2];
1144
+ state->s2[2] = (unsigned int)p2;
1145
+ r = p1 - p2;
1146
+ if (r <= 0) r += MRG32K3A_MOD1;
1147
+ return r;
1148
+ }
1149
+
1150
+
1151
+ /**
1152
+ * \brief Return 32-bits of pseudorandomness from an MRG32k3a generator.
1153
+ *
1154
+ * Return 32-bits of pseudorandomness from the MRG32k3a generator in \p state,
1155
+ * increment position of generator by one.
1156
+ *
1157
+ * \param state - Pointer to state to update
1158
+ *
1159
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
1160
+ */
1161
+ QUALIFIERS unsigned int curand(curandStateMRG32k3a_t *state)
1162
+ {
1163
+ double dRet;
1164
+ dRet = (double)curand_MRG32k3a(state)*(double)MRG32K3A_BITS_NORM;
1165
+ return (unsigned int)dRet;
1166
+ }
1167
+
1168
+
1169
+
1170
+ /**
1171
+ * \brief Update MRG32k3a state to skip \p n elements.
1172
+ *
1173
+ * Update the MRG32k3a state in \p state to skip ahead \p n elements.
1174
+ *
1175
+ * All values of \p n are valid. Large values require more computation and so
1176
+ * will take more time to complete.
1177
+ *
1178
+ * \param n - Number of elements to skip
1179
+ * \param state - Pointer to state to update
1180
+ */
1181
+ QUALIFIERS void skipahead(unsigned long long n, curandStateMRG32k3a_t *state)
1182
+ {
1183
+ unsigned int t[3][3];
1184
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
1185
+ curand_MRGmatPow3x3( mrg32k3aM1, t, MRG32K3A_MOD1, n);
1186
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1187
+ curand_MRGmatPow3x3(mrg32k3aM2, t, MRG32K3A_MOD2, n);
1188
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1189
+ ,
1190
+ curand_MRGmatPow3x3( mrg32k3aM1Host, t, MRG32K3A_MOD1, n);
1191
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1192
+ curand_MRGmatPow3x3(mrg32k3aM2Host, t, MRG32K3A_MOD2, n);
1193
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1194
+ )
1195
+ }
1196
+
1197
+ /**
1198
+ * \brief Update MRG32k3a state to skip ahead \p n subsequences.
1199
+ *
1200
+ * Update the MRG32k3a state in \p state to skip ahead \p n subsequences. Each
1201
+ * subsequence is \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly
1202
+ *
1203
+ * \xmlonly<ph outputclass="xmlonly">2<sup>76</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
1204
+ * \xmlonly<ph outputclass="xmlonly">2<sup>67</sup></ph>\endxmlonly * n elements.
1205
+ *
1206
+ * Valid values of \p n are 0 to \xmlonly<ph outputclass="xmlonly">2<sup>51</sup></ph>\endxmlonly. Note \p n will be masked to 51 bits
1207
+ *
1208
+ * \param n - Number of subsequences to skip
1209
+ * \param state - Pointer to state to update
1210
+ */
1211
+ QUALIFIERS void skipahead_subsequence(unsigned long long n, curandStateMRG32k3a_t *state)
1212
+ {
1213
+ unsigned int t[3][3];
1214
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
1215
+ curand_MRGmatPow3x3( mrg32k3aM1SubSeq, t, MRG32K3A_MOD1, n);
1216
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1217
+ curand_MRGmatPow3x3( mrg32k3aM2SubSeq, t, MRG32K3A_MOD2, n);
1218
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1219
+ ,
1220
+ curand_MRGmatPow3x3( mrg32k3aM1SubSeqHost, t, MRG32K3A_MOD1, n);
1221
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1222
+ curand_MRGmatPow3x3( mrg32k3aM2SubSeqHost, t, MRG32K3A_MOD2, n);
1223
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1224
+ )
1225
+ }
1226
+
1227
+ /**
1228
+ * \brief Update MRG32k3a state to skip ahead \p n sequences.
1229
+ *
1230
+ * Update the MRG32k3a state in \p state to skip ahead \p n sequences. Each
1231
+ * sequence is \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly elements long, so this means the function will skip ahead
1232
+ * \xmlonly<ph outputclass="xmlonly">2<sup>127</sup></ph>\endxmlonly * n elements.
1233
+ *
1234
+ * All values of \p n are valid. Large values require more computation and so
1235
+ * will take more time to complete.
1236
+ *
1237
+ * \param n - Number of sequences to skip
1238
+ * \param state - Pointer to state to update
1239
+ */
1240
+ QUALIFIERS void skipahead_sequence(unsigned long long n, curandStateMRG32k3a_t *state)
1241
+ {
1242
+ unsigned int t[3][3];
1243
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
1244
+ curand_MRGmatPow3x3( mrg32k3aM1Seq, t, MRG32K3A_MOD1, n);
1245
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1246
+ curand_MRGmatPow3x3( mrg32k3aM2Seq, t, MRG32K3A_MOD2, n);
1247
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1248
+ ,
1249
+ curand_MRGmatPow3x3( mrg32k3aM1SeqHost, t, MRG32K3A_MOD1, n);
1250
+ curand_MRGmatVecMul3x3( t, state->s1, MRG32K3A_MOD1);
1251
+ curand_MRGmatPow3x3( mrg32k3aM2SeqHost, t, MRG32K3A_MOD2, n);
1252
+ curand_MRGmatVecMul3x3( t, state->s2, MRG32K3A_MOD2);
1253
+ )
1254
+ }
1255
+
1256
+
1257
+ /**
1258
+ * \brief Initialize MRG32k3a state.
1259
+ *
1260
+ * Initialize MRG32k3a state in \p state with the given \p seed, \p subsequence,
1261
+ * and \p offset.
1262
+ *
1263
+ * All input values of \p seed, \p subsequence, and \p offset are legal.
1264
+ * \p subsequence will be truncated to 51 bits to avoid running into the next sequence
1265
+ *
1266
+ * A value of 0 for \p seed sets the state to the values of the original
1267
+ * published version of the \p MRG32k3a algorithm.
1268
+ *
1269
+ * \param seed - Arbitrary bits to use as a seed
1270
+ * \param subsequence - Subsequence to start at
1271
+ * \param offset - Absolute offset into sequence
1272
+ * \param state - Pointer to state to initialize
1273
+ */
1274
+ QUALIFIERS void curand_init(unsigned long long seed,
1275
+ unsigned long long subsequence,
1276
+ unsigned long long offset,
1277
+ curandStateMRG32k3a_t *state)
1278
+ {
1279
+ int i;
1280
+ for ( i=0; i<3; i++ ) {
1281
+ state->s1[i] = 12345u;
1282
+ state->s2[i] = 12345u;
1283
+ }
1284
+ if (seed != 0ull) {
1285
+ unsigned int x1 = ((unsigned int)seed) ^ 0x55555555UL;
1286
+ unsigned int x2 = (unsigned int)((seed >> 32) ^ 0xAAAAAAAAUL);
1287
+ state->s1[0] = (unsigned int)curand_MRGmodMul(x1, state->s1[0], MRG32K3A_MOD1);
1288
+ state->s1[1] = (unsigned int)curand_MRGmodMul(x2, state->s1[1], MRG32K3A_MOD1);
1289
+ state->s1[2] = (unsigned int)curand_MRGmodMul(x1, state->s1[2], MRG32K3A_MOD1);
1290
+ state->s2[0] = (unsigned int)curand_MRGmodMul(x2, state->s2[0], MRG32K3A_MOD2);
1291
+ state->s2[1] = (unsigned int)curand_MRGmodMul(x1, state->s2[1], MRG32K3A_MOD2);
1292
+ state->s2[2] = (unsigned int)curand_MRGmodMul(x2, state->s2[2], MRG32K3A_MOD2);
1293
+ }
1294
+ skipahead_subsequence( subsequence, state );
1295
+ skipahead( offset, state );
1296
+ state->boxmuller_flag = 0;
1297
+ state->boxmuller_flag_double = 0;
1298
+ state->boxmuller_extra = 0.f;
1299
+ state->boxmuller_extra_double = 0.;
1300
+ }
1301
+
1302
+ /**
1303
+ * \brief Update Sobol32 state to skip \p n elements.
1304
+ *
1305
+ * Update the Sobol32 state in \p state to skip ahead \p n elements.
1306
+ *
1307
+ * All values of \p n are valid.
1308
+ *
1309
+ * \param n - Number of elements to skip
1310
+ * \param state - Pointer to state to update
1311
+ */
1312
+ template <typename T>
1313
+ QUALIFIERS
1314
+ typename CURAND_STD::enable_if<CURAND_STD::is_same<curandStateSobol32_t*, T>::value || CURAND_STD::is_same<curandStateScrambledSobol32_t*, T>::value>::type
1315
+ skipahead(unsigned int n, T state)
1316
+ {
1317
+ unsigned int i_gray;
1318
+ state->x = state->c;
1319
+ state->i += n;
1320
+ /* Convert state->i to gray code */
1321
+ i_gray = state->i ^ (state->i >> 1);
1322
+ for(unsigned int k = 0; k < 32; k++) {
1323
+ if(i_gray & (1 << k)) {
1324
+ state->x ^= state->direction_vectors[k];
1325
+ }
1326
+ }
1327
+ return;
1328
+ }
1329
+
1330
+ /**
1331
+ * \brief Update Sobol64 state to skip \p n elements.
1332
+ *
1333
+ * Update the Sobol64 state in \p state to skip ahead \p n elements.
1334
+ *
1335
+ * All values of \p n are valid.
1336
+ *
1337
+ * \param n - Number of elements to skip
1338
+ * \param state - Pointer to state to update
1339
+ */
1340
+ template <typename T>
1341
+ QUALIFIERS
1342
+ typename CURAND_STD::enable_if<CURAND_STD::is_same<curandStateSobol64_t*, T>::value || CURAND_STD::is_same<curandStateScrambledSobol64_t*, T>::value>::type
1343
+ skipahead(unsigned long long n, T state)
1344
+ {
1345
+ unsigned long long i_gray;
1346
+ state->x = state->c;
1347
+ state->i += n;
1348
+ /* Convert state->i to gray code */
1349
+ i_gray = state->i ^ (state->i >> 1);
1350
+ for(unsigned k = 0; k < 64; k++) {
1351
+ if(i_gray & (1ULL << k)) {
1352
+ state->x ^= state->direction_vectors[k];
1353
+ }
1354
+ }
1355
+ return;
1356
+ }
1357
+
1358
+ /**
1359
+ * \brief Initialize Sobol32 state.
1360
+ *
1361
+ * Initialize Sobol32 state in \p state with the given \p direction \p vectors and
1362
+ * \p offset.
1363
+ *
1364
+ * The direction vector is a device pointer to an array of 32 unsigned ints.
1365
+ * All input values of \p offset are legal.
1366
+ *
1367
+ * \param direction_vectors - Pointer to array of 32 unsigned ints representing the
1368
+ * direction vectors for the desired dimension
1369
+ * \param offset - Absolute offset into sequence
1370
+ * \param state - Pointer to state to initialize
1371
+ */
1372
+ QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
1373
+ unsigned int offset,
1374
+ curandStateSobol32_t *state)
1375
+ {
1376
+ state->i = 0;
1377
+ state->c = 0;
1378
+ for(int i = 0; i < 32; i++) {
1379
+ state->direction_vectors[i] = direction_vectors[i];
1380
+ }
1381
+ state->x = 0;
1382
+ skipahead<curandStateSobol32_t *>(offset, state);
1383
+ }
1384
+ /**
1385
+ * \brief Initialize Scrambled Sobol32 state.
1386
+ *
1387
+ * Initialize Sobol32 state in \p state with the given \p direction \p vectors and
1388
+ * \p offset.
1389
+ *
1390
+ * The direction vector is a device pointer to an array of 32 unsigned ints.
1391
+ * All input values of \p offset are legal.
1392
+ *
1393
+ * \param direction_vectors - Pointer to array of 32 unsigned ints representing the
1394
+ direction vectors for the desired dimension
1395
+ * \param scramble_c Scramble constant
1396
+ * \param offset - Absolute offset into sequence
1397
+ * \param state - Pointer to state to initialize
1398
+ */
1399
+ QUALIFIERS void curand_init(curandDirectionVectors32_t direction_vectors,
1400
+ unsigned int scramble_c,
1401
+ unsigned int offset,
1402
+ curandStateScrambledSobol32_t *state)
1403
+ {
1404
+ state->i = 0;
1405
+ state->c = scramble_c;
1406
+ for(int i = 0; i < 32; i++) {
1407
+ state->direction_vectors[i] = direction_vectors[i];
1408
+ }
1409
+ state->x = state->c;
1410
+ skipahead<curandStateScrambledSobol32_t *>(offset, state);
1411
+ }
1412
+
1413
+ QUALIFIERS int __curand_find_trailing_zero(unsigned int x)
1414
+ {
1415
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
1416
+ int y = __ffs(~x);
1417
+ if(y)
1418
+ return y - 1;
1419
+ return 31;
1420
+ ,
1421
+ int i = 1;
1422
+ while(x & 1) {
1423
+ i++;
1424
+ x >>= 1;
1425
+ }
1426
+ i = i - 1;
1427
+ return i == 32 ? 31 : i;
1428
+ )
1429
+ }
1430
+
1431
+ QUALIFIERS int __curand_find_trailing_zero(unsigned long long x)
1432
+ {
1433
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
1434
+ int y = __ffsll(~x);
1435
+ if(y)
1436
+ return y - 1;
1437
+ return 63;
1438
+ ,
1439
+ int i = 1;
1440
+ while(x & 1) {
1441
+ i++;
1442
+ x >>= 1;
1443
+ }
1444
+ i = i - 1;
1445
+ return i == 64 ? 63 : i;
1446
+ )
1447
+ }
1448
+
1449
+ /**
1450
+ * \brief Initialize Sobol64 state.
1451
+ *
1452
+ * Initialize Sobol64 state in \p state with the given \p direction \p vectors and
1453
+ * \p offset.
1454
+ *
1455
+ * The direction vector is a device pointer to an array of 64 unsigned long longs.
1456
+ * All input values of \p offset are legal.
1457
+ *
1458
+ * \param direction_vectors - Pointer to array of 64 unsigned long longs representing the
1459
+ direction vectors for the desired dimension
1460
+ * \param offset - Absolute offset into sequence
1461
+ * \param state - Pointer to state to initialize
1462
+ */
1463
+ QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
1464
+ unsigned long long offset,
1465
+ curandStateSobol64_t *state)
1466
+ {
1467
+ state->i = 0;
1468
+ state->c = 0;
1469
+ for(int i = 0; i < 64; i++) {
1470
+ state->direction_vectors[i] = direction_vectors[i];
1471
+ }
1472
+ state->x = 0;
1473
+ skipahead<curandStateSobol64_t *>(offset, state);
1474
+ }
1475
+
1476
+ /**
1477
+ * \brief Initialize Scrambled Sobol64 state.
1478
+ *
1479
+ * Initialize Sobol64 state in \p state with the given \p direction \p vectors and
1480
+ * \p offset.
1481
+ *
1482
+ * The direction vector is a device pointer to an array of 64 unsigned long longs.
1483
+ * All input values of \p offset are legal.
1484
+ *
1485
+ * \param direction_vectors - Pointer to array of 64 unsigned long longs representing the
1486
+ direction vectors for the desired dimension
1487
+ * \param scramble_c Scramble constant
1488
+ * \param offset - Absolute offset into sequence
1489
+ * \param state - Pointer to state to initialize
1490
+ */
1491
+ QUALIFIERS void curand_init(curandDirectionVectors64_t direction_vectors,
1492
+ unsigned long long scramble_c,
1493
+ unsigned long long offset,
1494
+ curandStateScrambledSobol64_t *state)
1495
+ {
1496
+ state->i = 0;
1497
+ state->c = scramble_c;
1498
+ for(int i = 0; i < 64; i++) {
1499
+ state->direction_vectors[i] = direction_vectors[i];
1500
+ }
1501
+ state->x = state->c;
1502
+ skipahead<curandStateScrambledSobol64_t *>(offset, state);
1503
+ }
1504
+
1505
+ /**
1506
+ * \brief Return 32-bits of quasirandomness from a Sobol32 generator.
1507
+ *
1508
+ * Return 32-bits of quasirandomness from the Sobol32 generator in \p state,
1509
+ * increment position of generator by one.
1510
+ *
1511
+ * \param state - Pointer to state to update
1512
+ *
1513
+ * \return 32-bits of quasirandomness as an unsigned int, all bits valid to use.
1514
+ */
1515
+
1516
+ QUALIFIERS unsigned int curand(curandStateSobol32_t * state)
1517
+ {
1518
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1519
+ the trailing zero bit of i
1520
+ */
1521
+ unsigned int res = state->x;
1522
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1523
+ state->i ++;
1524
+ return res;
1525
+ }
1526
+
1527
+ /**
1528
+ * \brief Return 32-bits of quasirandomness from a scrambled Sobol32 generator.
1529
+ *
1530
+ * Return 32-bits of quasirandomness from the scrambled Sobol32 generator in \p state,
1531
+ * increment position of generator by one.
1532
+ *
1533
+ * \param state - Pointer to state to update
1534
+ *
1535
+ * \return 32-bits of quasirandomness as an unsigned int, all bits valid to use.
1536
+ */
1537
+
1538
+ QUALIFIERS unsigned int curand(curandStateScrambledSobol32_t * state)
1539
+ {
1540
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1541
+ the trailing zero bit of i
1542
+ */
1543
+ unsigned int res = state->x;
1544
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1545
+ state->i ++;
1546
+ return res;
1547
+ }
1548
+
1549
+ /**
1550
+ * \brief Return 64-bits of quasirandomness from a Sobol64 generator.
1551
+ *
1552
+ * Return 64-bits of quasirandomness from the Sobol64 generator in \p state,
1553
+ * increment position of generator by one.
1554
+ *
1555
+ * \param state - Pointer to state to update
1556
+ *
1557
+ * \return 64-bits of quasirandomness as an unsigned long long, all bits valid to use.
1558
+ */
1559
+
1560
+ QUALIFIERS unsigned long long curand(curandStateSobol64_t * state)
1561
+ {
1562
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1563
+ the trailing zero bit of i
1564
+ */
1565
+ unsigned long long res = state->x;
1566
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1567
+ state->i ++;
1568
+ return res;
1569
+ }
1570
+
1571
+ /**
1572
+ * \brief Return 64-bits of quasirandomness from a scrambled Sobol64 generator.
1573
+ *
1574
+ * Return 64-bits of quasirandomness from the scrambled Sobol32 generator in \p state,
1575
+ * increment position of generator by one.
1576
+ *
1577
+ * \param state - Pointer to state to update
1578
+ *
1579
+ * \return 64-bits of quasirandomness as an unsigned long long, all bits valid to use.
1580
+ */
1581
+
1582
+ QUALIFIERS unsigned long long curand(curandStateScrambledSobol64_t * state)
1583
+ {
1584
+ /* Moving from i to i+1 element in gray code is flipping one bit,
1585
+ the trailing zero bit of i
1586
+ */
1587
+ unsigned long long res = state->x;
1588
+ state->x ^= state->direction_vectors[__curand_find_trailing_zero(state->i)];
1589
+ state->i ++;
1590
+ return res;
1591
+ }
1592
+
1593
+ #include "curand_uniform.h"
1594
+ #include "curand_normal.h"
1595
+ #include "curand_lognormal.h"
1596
+ #include "curand_poisson.h"
1597
+ #include "curand_discrete2.h"
1598
+
1599
+ __device__ static inline unsigned int *__get_precalculated_matrix(int n)
1600
+ {
1601
+ if(n == 0) {
1602
+ return precalc_xorwow_matrix[n];
1603
+ }
1604
+ if(n == 2) {
1605
+ return precalc_xorwow_offset_matrix[n];
1606
+ }
1607
+ return precalc_xorwow_matrix[n];
1608
+ }
1609
+
1610
+ #ifndef __CUDACC_RTC__
1611
+ __host__ static inline unsigned int *__get_precalculated_matrix_host(int n)
1612
+ {
1613
+ if(n == 1) {
1614
+ return precalc_xorwow_matrix_host[n];
1615
+ }
1616
+ if(n == 3) {
1617
+ return precalc_xorwow_offset_matrix_host[n];
1618
+ }
1619
+ return precalc_xorwow_matrix_host[n];
1620
+ }
1621
+ #endif // #ifndef __CUDACC_RTC__
1622
+
1623
+ __device__ static inline unsigned int *__get_mrg32k3a_matrix(int n)
1624
+ {
1625
+ if(n == 0) {
1626
+ return mrg32k3aM1[n][0];
1627
+ }
1628
+ if(n == 2) {
1629
+ return mrg32k3aM2[n][0];
1630
+ }
1631
+ if(n == 4) {
1632
+ return mrg32k3aM1SubSeq[n][0];
1633
+ }
1634
+ if(n == 6) {
1635
+ return mrg32k3aM2SubSeq[n][0];
1636
+ }
1637
+ if(n == 8) {
1638
+ return mrg32k3aM1Seq[n][0];
1639
+ }
1640
+ if(n == 10) {
1641
+ return mrg32k3aM2Seq[n][0];
1642
+ }
1643
+ return mrg32k3aM1[n][0];
1644
+ }
1645
+
1646
+ #ifndef __CUDACC_RTC__
1647
+ __host__ static inline unsigned int *__get_mrg32k3a_matrix_host(int n)
1648
+ {
1649
+ if(n == 1) {
1650
+ return mrg32k3aM1Host[n][0];
1651
+ }
1652
+ if(n == 3) {
1653
+ return mrg32k3aM2Host[n][0];
1654
+ }
1655
+ if(n == 5) {
1656
+ return mrg32k3aM1SubSeqHost[n][0];
1657
+ }
1658
+ if(n == 7) {
1659
+ return mrg32k3aM2SubSeqHost[n][0];
1660
+ }
1661
+ if(n == 9) {
1662
+ return mrg32k3aM1SeqHost[n][0];
1663
+ }
1664
+ if(n == 11) {
1665
+ return mrg32k3aM2SeqHost[n][0];
1666
+ }
1667
+ return mrg32k3aM1Host[n][0];
1668
+ }
1669
+
1670
+ __host__ static inline double *__get__cr_lgamma_table_host(void) {
1671
+ return __cr_lgamma_table;
1672
+ }
1673
+ #endif // #ifndef __CUDACC_RTC__
1674
+
1675
+ /** @} */
1676
+
1677
+ #endif // !defined(CURAND_KERNEL_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_lognormal.h ADDED
@@ -0,0 +1,697 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_LOGNORMAL_H_)
52
+ #define CURAND_LOGNORMAL_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+
68
+ /**
69
+ * \brief Return a log-normally distributed float from an XORWOW generator.
70
+ *
71
+ * Return a single log-normally distributed float derived from a normal
72
+ * distribution with mean \p mean and standard deviation \p stddev
73
+ * from the XORWOW generator in \p state,
74
+ * increment position of generator by one.
75
+ *
76
+ * The implementation uses a Box-Muller transform to generate two
77
+ * normally distributed results, transforms them to log-normal distribution,
78
+ * then returns them one at a time.
79
+ * See ::curand_log_normal2() for a more efficient version that returns
80
+ * both results at once.
81
+ *
82
+ * \param state - Pointer to state to update
83
+ * \param mean - Mean of the related normal distribution
84
+ * \param stddev - Standard deviation of the related normal distribution
85
+ *
86
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
87
+ */
88
+ QUALIFIERS float curand_log_normal(curandStateXORWOW_t *state, float mean, float stddev)
89
+ {
90
+ if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
91
+ unsigned int x, y;
92
+ x = curand(state);
93
+ y = curand(state);
94
+ float2 v = _curand_box_muller(x, y);
95
+ state->boxmuller_extra = expf(mean + (stddev * v.y));
96
+ state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
97
+ return expf(mean + (stddev * v.x));
98
+ }
99
+ state->boxmuller_flag = 0;
100
+ return state->boxmuller_extra;
101
+ }
102
+
103
+ /**
104
+ * \brief Return a log-normally distributed float from an Philox4_32_10 generator.
105
+ *
106
+ * Return a single log-normally distributed float derived from a normal
107
+ * distribution with mean \p mean and standard deviation \p stddev
108
+ * from the Philox4_32_10 generator in \p state,
109
+ * increment position of generator by one.
110
+ *
111
+ * The implementation uses a Box-Muller transform to generate two
112
+ * normally distributed results, transforms them to log-normal distribution,
113
+ * then returns them one at a time.
114
+ * See ::curand_log_normal2() for a more efficient version that returns
115
+ * both results at once.
116
+ *
117
+ * \param state - Pointer to state to update
118
+ * \param mean - Mean of the related normal distribution
119
+ * \param stddev - Standard deviation of the related normal distribution
120
+ *
121
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
122
+ */
123
+
124
+ QUALIFIERS float curand_log_normal(curandStatePhilox4_32_10_t *state, float mean, float stddev)
125
+ {
126
+ if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
127
+ unsigned int x, y;
128
+ x = curand(state);
129
+ y = curand(state);
130
+ float2 v = _curand_box_muller(x, y);
131
+ state->boxmuller_extra = expf(mean + (stddev * v.y));
132
+ state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
133
+ return expf(mean + (stddev * v.x));
134
+ }
135
+ state->boxmuller_flag = 0;
136
+ return state->boxmuller_extra;
137
+ }
138
+
139
+ /**
140
+ * \brief Return two normally distributed floats from an XORWOW generator.
141
+ *
142
+ * Return two log-normally distributed floats derived from a normal
143
+ * distribution with mean \p mean and standard deviation \p stddev
144
+ * from the XORWOW generator in \p state,
145
+ * increment position of generator by two.
146
+ *
147
+ * The implementation uses a Box-Muller transform to generate two
148
+ * normally distributed results, then transforms them to log-normal.
149
+ *
150
+ * \param state - Pointer to state to update
151
+ * \param mean - Mean of the related normal distribution
152
+ * \param stddev - Standard deviation of the related normal distribution
153
+ *
154
+ * \return Log-normally distributed float2 where each element is from a
155
+ * distribution with mean \p mean and standard deviation \p stddev
156
+ */
157
+ QUALIFIERS float2 curand_log_normal2(curandStateXORWOW_t *state, float mean, float stddev)
158
+ {
159
+ float2 v = curand_box_muller(state);
160
+ v.x = expf(mean + (stddev * v.x));
161
+ v.y = expf(mean + (stddev * v.y));
162
+ return v;
163
+ }
164
+
165
+ /**
166
+ * \brief Return two normally distributed floats from an Philox4_32_10 generator.
167
+ *
168
+ * Return two log-normally distributed floats derived from a normal
169
+ * distribution with mean \p mean and standard deviation \p stddev
170
+ * from the Philox4_32_10 generator in \p state,
171
+ * increment position of generator by two.
172
+ *
173
+ * The implementation uses a Box-Muller transform to generate two
174
+ * normally distributed results, then transforms them to log-normal.
175
+ *
176
+ * \param state - Pointer to state to update
177
+ * \param mean - Mean of the related normal distribution
178
+ * \param stddev - Standard deviation of the related normal distribution
179
+ *
180
+ * \return Log-normally distributed float2 where each element is from a
181
+ * distribution with mean \p mean and standard deviation \p stddev
182
+ */
183
+ QUALIFIERS float2 curand_log_normal2(curandStatePhilox4_32_10_t *state, float mean, float stddev)
184
+ {
185
+ float2 v = curand_box_muller(state);
186
+ v.x = expf(mean + (stddev * v.x));
187
+ v.y = expf(mean + (stddev * v.y));
188
+ return v;
189
+ }
190
+ /**
191
+ * \brief Return four normally distributed floats from an Philox4_32_10 generator.
192
+ *
193
+ * Return four log-normally distributed floats derived from a normal
194
+ * distribution with mean \p mean and standard deviation \p stddev
195
+ * from the Philox4_32_10 generator in \p state,
196
+ * increment position of generator by four.
197
+ *
198
+ * The implementation uses a Box-Muller transform to generate two
199
+ * normally distributed results, then transforms them to log-normal.
200
+ *
201
+ * \param state - Pointer to state to update
202
+ * \param mean - Mean of the related normal distribution
203
+ * \param stddev - Standard deviation of the related normal distribution
204
+ *
205
+ * \return Log-normally distributed float4 where each element is from a
206
+ * distribution with mean \p mean and standard deviation \p stddev
207
+ */
208
+ QUALIFIERS float4 curand_log_normal4(curandStatePhilox4_32_10_t *state, float mean, float stddev)
209
+ {
210
+ float4 v = curand_box_muller4(state);
211
+ v.x = expf(mean + (stddev * v.x));
212
+ v.y = expf(mean + (stddev * v.y));
213
+ v.z = expf(mean + (stddev * v.z));
214
+ v.w = expf(mean + (stddev * v.w));
215
+ return v;
216
+ }
217
+
218
+ /**
219
+ * \brief Return a log-normally distributed float from an MRG32k3a generator.
220
+ *
221
+ * Return a single log-normally distributed float derived from a normal
222
+ * distribution with mean \p mean and standard deviation \p stddev
223
+ * from the MRG32k3a generator in \p state,
224
+ * increment position of generator by one.
225
+ *
226
+ * The implementation uses a Box-Muller transform to generate two
227
+ * normally distributed results, transforms them to log-normal distribution,
228
+ * then returns them one at a time.
229
+ * See ::curand_log_normal2() for a more efficient version that returns
230
+ * both results at once.
231
+ *
232
+ * \param state - Pointer to state to update
233
+ * \param mean - Mean of the related normal distribution
234
+ * \param stddev - Standard deviation of the related normal distribution
235
+ *
236
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
237
+ */
238
+ QUALIFIERS float curand_log_normal(curandStateMRG32k3a_t *state, float mean, float stddev)
239
+ {
240
+ if(state->boxmuller_flag != EXTRA_FLAG_LOG_NORMAL) {
241
+ float2 v = curand_box_muller_mrg(state);
242
+ state->boxmuller_extra = expf(mean + (stddev * v.y));
243
+ state->boxmuller_flag = EXTRA_FLAG_LOG_NORMAL;
244
+ return expf(mean + (stddev * v.x));
245
+ }
246
+ state->boxmuller_flag = 0;
247
+ return state->boxmuller_extra;
248
+ }
249
+
250
+ /**
251
+ * \brief Return two normally distributed floats from an MRG32k3a generator.
252
+ *
253
+ * Return two log-normally distributed floats derived from a normal
254
+ * distribution with mean \p mean and standard deviation \p stddev
255
+ * from the MRG32k3a generator in \p state,
256
+ * increment position of generator by two.
257
+ *
258
+ * The implementation uses a Box-Muller transform to generate two
259
+ * normally distributed results, then transforms them to log-normal.
260
+ *
261
+ * \param state - Pointer to state to update
262
+ * \param mean - Mean of the related normal distribution
263
+ * \param stddev - Standard deviation of the related normal distribution
264
+ *
265
+ * \return Log-normally distributed float2 where each element is from a
266
+ * distribution with mean \p mean and standard deviation \p stddev
267
+ */
268
+ QUALIFIERS float2 curand_log_normal2(curandStateMRG32k3a_t *state, float mean, float stddev)
269
+ {
270
+ float2 v = curand_box_muller_mrg(state);
271
+ v.x = expf(mean + (stddev * v.x));
272
+ v.y = expf(mean + (stddev * v.y));
273
+ return v;
274
+ }
275
+
276
+ /**
277
+ * \brief Return a log-normally distributed float from an MTGP32 generator.
278
+ *
279
+ * Return a single log-normally distributed float derived from a normal
280
+ * distribution with mean \p mean and standard deviation \p stddev
281
+ * from the MTGP32 generator in \p state,
282
+ * increment position of generator.
283
+ *
284
+ * The implementation uses the inverse cumulative distribution function
285
+ * to generate a normally distributed result, then transforms the result
286
+ * to log-normal.
287
+ *
288
+ * \param state - Pointer to state to update
289
+ * \param mean - Mean of the related normal distribution
290
+ * \param stddev - Standard deviation of the related normal distribution
291
+ *
292
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
293
+ */
294
+ QUALIFIERS float curand_log_normal(curandStateMtgp32_t *state, float mean, float stddev)
295
+ {
296
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
297
+ }
298
+
299
+ /**
300
+ * \brief Return a log-normally distributed float from a Sobol32 generator.
301
+ *
302
+ * Return a single log-normally distributed float derived from a normal
303
+ * distribution with mean \p mean and standard deviation \p stddev
304
+ * from the Sobol32 generator in \p state,
305
+ * increment position of generator by one.
306
+ *
307
+ * The implementation uses the inverse cumulative distribution function
308
+ * to generate a normally distributed result, then transforms the result
309
+ * to log-normal.
310
+ *
311
+ * \param state - Pointer to state to update
312
+ * \param mean - Mean of the related normal distribution
313
+ * \param stddev - Standard deviation of the related normal distribution
314
+ *
315
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
316
+ */
317
+ QUALIFIERS float curand_log_normal(curandStateSobol32_t *state, float mean, float stddev)
318
+ {
319
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
320
+ }
321
+ /**
322
+ * \brief Return a log-normally distributed float from a scrambled Sobol32 generator.
323
+ *
324
+ * Return a single log-normally distributed float derived from a normal
325
+ * distribution with mean \p mean and standard deviation \p stddev
326
+ * from the scrambled Sobol32 generator in \p state,
327
+ * increment position of generator by one.
328
+ *
329
+ * The implementation uses the inverse cumulative distribution function
330
+ * to generate a normally distributed result, then transforms the result
331
+ * to log-normal.
332
+ *
333
+ * \param state - Pointer to state to update
334
+ * \param mean - Mean of the related normal distribution
335
+ * \param stddev - Standard deviation of the related normal distribution
336
+ *
337
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
338
+ */
339
+ QUALIFIERS float curand_log_normal(curandStateScrambledSobol32_t *state, float mean, float stddev)
340
+ {
341
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
342
+ }
343
+
344
+ /**
345
+ * \brief Return a log-normally distributed float from a Sobol64 generator.
346
+ *
347
+ * Return a single log-normally distributed float derived from a normal
348
+ * distribution with mean \p mean and standard deviation \p stddev
349
+ * from the Sobol64 generator in \p state,
350
+ * increment position of generator by one.
351
+ *
352
+ * The implementation uses the inverse cumulative distribution function
353
+ * to generate normally distributed results, then converts to log-normal
354
+ * distribution.
355
+ *
356
+ * \param state - Pointer to state to update
357
+ * \param mean - Mean of the related normal distribution
358
+ * \param stddev - Standard deviation of the related normal distribution
359
+ *
360
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
361
+ */
362
+ QUALIFIERS float curand_log_normal(curandStateSobol64_t *state, float mean, float stddev)
363
+ {
364
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
365
+ }
366
+
367
+ /**
368
+ * \brief Return a log-normally distributed float from a scrambled Sobol64 generator.
369
+ *
370
+ * Return a single log-normally distributed float derived from a normal
371
+ * distribution with mean \p mean and standard deviation \p stddev
372
+ * from the scrambled Sobol64 generator in \p state,
373
+ * increment position of generator by one.
374
+ *
375
+ * The implementation uses the inverse cumulative distribution function
376
+ * to generate normally distributed results, then converts to log-normal
377
+ * distribution.
378
+ *
379
+ * \param state - Pointer to state to update
380
+ * \param mean - Mean of the related normal distribution
381
+ * \param stddev - Standard deviation of the related normal distribution
382
+ *
383
+ * \return Log-normally distributed float with mean \p mean and standard deviation \p stddev
384
+ */
385
+ QUALIFIERS float curand_log_normal(curandStateScrambledSobol64_t *state, float mean, float stddev)
386
+ {
387
+ return expf(mean + (stddev * _curand_normal_icdf(curand(state))));
388
+ }
389
+
390
+ /**
391
+ * \brief Return a log-normally distributed double from an XORWOW generator.
392
+ *
393
+ * Return a single normally distributed double derived from a normal
394
+ * distribution with mean \p mean and standard deviation \p stddev
395
+ * from the XORWOW generator in \p state,
396
+ * increment position of generator.
397
+ *
398
+ * The implementation uses a Box-Muller transform to generate two
399
+ * normally distributed results, transforms them to log-normal distribution,
400
+ * then returns them one at a time.
401
+ * See ::curand_log_normal2_double() for a more efficient version that returns
402
+ * both results at once.
403
+ *
404
+ * \param state - Pointer to state to update
405
+ * \param mean - Mean of the related normal distribution
406
+ * \param stddev - Standard deviation of the related normal distribution
407
+ *
408
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
409
+ */
410
+
411
+ QUALIFIERS double curand_log_normal_double(curandStateXORWOW_t *state, double mean, double stddev)
412
+ {
413
+ if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
414
+ unsigned int x0, x1, y0, y1;
415
+ x0 = curand(state);
416
+ x1 = curand(state);
417
+ y0 = curand(state);
418
+ y1 = curand(state);
419
+ double2 v = _curand_box_muller_double(x0, x1, y0, y1);
420
+ state->boxmuller_extra_double = exp(mean + (stddev * v.y));
421
+ state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
422
+ return exp(mean + (stddev * v.x));
423
+ }
424
+ state->boxmuller_flag_double = 0;
425
+ return state->boxmuller_extra_double;
426
+ }
427
+
428
+ /**
429
+ * \brief Return a log-normally distributed double from an Philox4_32_10 generator.
430
+ *
431
+ * Return a single normally distributed double derived from a normal
432
+ * distribution with mean \p mean and standard deviation \p stddev
433
+ * from the Philox4_32_10 generator in \p state,
434
+ * increment position of generator.
435
+ *
436
+ * The implementation uses a Box-Muller transform to generate two
437
+ * normally distributed results, transforms them to log-normal distribution,
438
+ * then returns them one at a time.
439
+ * See ::curand_log_normal2_double() for a more efficient version that returns
440
+ * both results at once.
441
+ *
442
+ * \param state - Pointer to state to update
443
+ * \param mean - Mean of the related normal distribution
444
+ * \param stddev - Standard deviation of the related normal distribution
445
+ *
446
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
447
+ */
448
+
449
+ QUALIFIERS double curand_log_normal_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
450
+ {
451
+ if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
452
+ uint4 _x;
453
+ _x = curand4(state);
454
+ double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
455
+ state->boxmuller_extra_double = exp(mean + (stddev * v.y));
456
+ state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
457
+ return exp(mean + (stddev * v.x));
458
+ }
459
+ state->boxmuller_flag_double = 0;
460
+ return state->boxmuller_extra_double;
461
+ }
462
+
463
+
464
+ /**
465
+ * \brief Return two log-normally distributed doubles from an XORWOW generator.
466
+ *
467
+ * Return two log-normally distributed doubles derived from a normal
468
+ * distribution with mean \p mean and standard deviation \p stddev
469
+ * from the XORWOW generator in \p state,
470
+ * increment position of generator by two.
471
+ *
472
+ * The implementation uses a Box-Muller transform to generate two
473
+ * normally distributed results, and transforms them to log-normal distribution,.
474
+ *
475
+ * \param state - Pointer to state to update
476
+ * \param mean - Mean of the related normal distribution
477
+ * \param stddev - Standard deviation of the related normal distribution
478
+ *
479
+ * \return Log-normally distributed double2 where each element is from a
480
+ * distribution with mean \p mean and standard deviation \p stddev
481
+ */
482
+ QUALIFIERS double2 curand_log_normal2_double(curandStateXORWOW_t *state, double mean, double stddev)
483
+ {
484
+ double2 v = curand_box_muller_double(state);
485
+ v.x = exp(mean + (stddev * v.x));
486
+ v.y = exp(mean + (stddev * v.y));
487
+ return v;
488
+ }
489
+
490
+ /**
491
+ * \brief Return two log-normally distributed doubles from an Philox4_32_10 generator.
492
+ *
493
+ * Return two log-normally distributed doubles derived from a normal
494
+ * distribution with mean \p mean and standard deviation \p stddev
495
+ * from the Philox4_32_10 generator in \p state,
496
+ * increment position of generator by four.
497
+ *
498
+ * The implementation uses a Box-Muller transform to generate two
499
+ * normally distributed results, and transforms them to log-normal distribution,.
500
+ *
501
+ * \param state - Pointer to state to update
502
+ * \param mean - Mean of the related normal distribution
503
+ * \param stddev - Standard deviation of the related normal distribution
504
+ *
505
+ * \return Log-normally distributed double4 where each element is from a
506
+ * distribution with mean \p mean and standard deviation \p stddev
507
+ */
508
+ QUALIFIERS double2 curand_log_normal2_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
509
+ {
510
+ double2 v = curand_box_muller2_double(state);
511
+ v.x = exp(mean + (stddev * v.x));
512
+ v.y = exp(mean + (stddev * v.y));
513
+ return v;
514
+ }
515
+ // nor part of API
516
+ QUALIFIERS double4 curand_log_normal4_double(curandStatePhilox4_32_10_t *state, double mean, double stddev)
517
+ {
518
+ double4 v = curand_box_muller4_double(state);
519
+ v.x = exp(mean + (stddev * v.x));
520
+ v.y = exp(mean + (stddev * v.y));
521
+ v.z = exp(mean + (stddev * v.z));
522
+ v.w = exp(mean + (stddev * v.w));
523
+ return v;
524
+ }
525
+
526
+ /**
527
+ * \brief Return a log-normally distributed double from an MRG32k3a generator.
528
+ *
529
+ * Return a single normally distributed double derived from a normal
530
+ * distribution with mean \p mean and standard deviation \p stddev
531
+ * from the MRG32k3a generator in \p state,
532
+ * increment position of generator.
533
+ *
534
+ * The implementation uses a Box-Muller transform to generate two
535
+ * normally distributed results, transforms them to log-normal distribution,
536
+ * then returns them one at a time.
537
+ * See ::curand_log_normal2_double() for a more efficient version that returns
538
+ * both results at once.
539
+ *
540
+ * \param state - Pointer to state to update
541
+ * \param mean - Mean of the related normal distribution
542
+ * \param stddev - Standard deviation of the related normal distribution
543
+ *
544
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
545
+ */
546
+ QUALIFIERS double curand_log_normal_double(curandStateMRG32k3a_t *state, double mean, double stddev)
547
+ {
548
+ if(state->boxmuller_flag_double != EXTRA_FLAG_LOG_NORMAL) {
549
+ double2 v = curand_box_muller_mrg_double(state);
550
+ state->boxmuller_extra_double = exp(mean + (stddev * v.y));
551
+ state->boxmuller_flag_double = EXTRA_FLAG_LOG_NORMAL;
552
+ return exp(mean + (stddev * v.x));
553
+ }
554
+ state->boxmuller_flag_double = 0;
555
+ return state->boxmuller_extra_double;
556
+ }
557
+
558
+ /**
559
+ * \brief Return two log-normally distributed doubles from an MRG32k3a generator.
560
+ *
561
+ * Return two log-normally distributed doubles derived from a normal
562
+ * distribution with mean \p mean and standard deviation \p stddev
563
+ * from the MRG32k3a generator in \p state,
564
+ * increment position of generator by two.
565
+ *
566
+ * The implementation uses a Box-Muller transform to generate two
567
+ * normally distributed results, and transforms them to log-normal distribution,.
568
+ *
569
+ * \param state - Pointer to state to update
570
+ * \param mean - Mean of the related normal distribution
571
+ * \param stddev - Standard deviation of the related normal distribution
572
+ *
573
+ * \return Log-normally distributed double2 where each element is from a
574
+ * distribution with mean \p mean and standard deviation \p stddev
575
+ */
576
+ QUALIFIERS double2 curand_log_normal2_double(curandStateMRG32k3a_t *state, double mean, double stddev)
577
+ {
578
+ double2 v = curand_box_muller_mrg_double(state);
579
+ v.x = exp(mean + (stddev * v.x));
580
+ v.y = exp(mean + (stddev * v.y));
581
+ return v;
582
+ }
583
+
584
+ /**
585
+ * \brief Return a log-normally distributed double from an MTGP32 generator.
586
+ *
587
+ * Return a single log-normally distributed double derived from a normal
588
+ * distribution with mean \p mean and standard deviation \p stddev
589
+ * from the MTGP32 generator in \p state,
590
+ * increment position of generator.
591
+ *
592
+ * The implementation uses the inverse cumulative distribution function
593
+ * to generate normally distributed results, and transforms them into
594
+ * log-normal distribution.
595
+ *
596
+ * \param state - Pointer to state to update
597
+ * \param mean - Mean of the related normal distribution
598
+ * \param stddev - Standard deviation of the related normal distribution
599
+ *
600
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
601
+ */
602
+ QUALIFIERS double curand_log_normal_double(curandStateMtgp32_t *state, double mean, double stddev)
603
+ {
604
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
605
+ }
606
+
607
+ /**
608
+ * \brief Return a log-normally distributed double from a Sobol32 generator.
609
+ *
610
+ * Return a single log-normally distributed double derived from a normal
611
+ * distribution with mean \p mean and standard deviation \p stddev
612
+ * from the Sobol32 generator in \p state,
613
+ * increment position of generator by one.
614
+ *
615
+ * The implementation uses the inverse cumulative distribution function
616
+ * to generate normally distributed results, and transforms them into
617
+ * log-normal distribution.
618
+ *
619
+ * \param state - Pointer to state to update
620
+ * \param mean - Mean of the related normal distribution
621
+ * \param stddev - Standard deviation of the related normal distribution
622
+ *
623
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
624
+ */
625
+ QUALIFIERS double curand_log_normal_double(curandStateSobol32_t *state, double mean, double stddev)
626
+ {
627
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
628
+ }
629
+
630
+ /**
631
+ * \brief Return a log-normally distributed double from a scrambled Sobol32 generator.
632
+ *
633
+ * Return a single log-normally distributed double derived from a normal
634
+ * distribution with mean \p mean and standard deviation \p stddev
635
+ * from the scrambled Sobol32 generator in \p state,
636
+ * increment position of generator by one.
637
+ *
638
+ * The implementation uses the inverse cumulative distribution function
639
+ * to generate normally distributed results, and transforms them into
640
+ * log-normal distribution.
641
+ *
642
+ * \param state - Pointer to state to update
643
+ * \param mean - Mean of the related normal distribution
644
+ * \param stddev - Standard deviation of the related normal distribution
645
+ *
646
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
647
+ */
648
+ QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol32_t *state, double mean, double stddev)
649
+ {
650
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
651
+ }
652
+
653
+ /**
654
+ * \brief Return a log-normally distributed double from a Sobol64 generator.
655
+ *
656
+ * Return a single normally distributed double derived from a normal
657
+ * distribution with mean \p mean and standard deviation \p stddev
658
+ * from the Sobol64 generator in \p state,
659
+ * increment position of generator by one.
660
+ *
661
+ * The implementation uses the inverse cumulative distribution function
662
+ * to generate normally distributed results.
663
+ *
664
+ * \param state - Pointer to state to update
665
+ * \param mean - Mean of the related normal distribution
666
+ * \param stddev - Standard deviation of the related normal distribution
667
+ *
668
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
669
+ */
670
+ QUALIFIERS double curand_log_normal_double(curandStateSobol64_t *state, double mean, double stddev)
671
+ {
672
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
673
+ }
674
+
675
+ /**
676
+ * \brief Return a log-normally distributed double from a scrambled Sobol64 generator.
677
+ *
678
+ * Return a single normally distributed double derived from a normal
679
+ * distribution with mean \p mean and standard deviation \p stddev
680
+ * from the scrambled Sobol64 generator in \p state,
681
+ * increment position of generator by one.
682
+ *
683
+ * The implementation uses the inverse cumulative distribution function
684
+ * to generate normally distributed results.
685
+ *
686
+ * \param state - Pointer to state to update
687
+ * \param mean - Mean of the related normal distribution
688
+ * \param stddev - Standard deviation of the related normal distribution
689
+ *
690
+ * \return Log-normally distributed double with mean \p mean and standard deviation \p stddev
691
+ */
692
+ QUALIFIERS double curand_log_normal_double(curandStateScrambledSobol64_t *state, double mean, double stddev)
693
+ {
694
+ return exp(mean + (stddev * _curand_normal_icdf_double(curand(state))));
695
+ }
696
+
697
+ #endif // !defined(CURAND_LOGNORMAL_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mrg32k3a.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32.h ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef CURAND_MTGP32_H
51
+ #define CURAND_MTGP32_H
52
+ /*
53
+ * @file curand_mtgp32.h
54
+ *
55
+ * @brief Mersenne Twister for Graphic Processors (mtgp32), which
56
+ * generates 32-bit unsigned integers and single precision floating
57
+ * point numbers based on IEEE 754 format.
58
+ *
59
+ * @author Mutsuo Saito (Hiroshima University)
60
+ * @author Makoto Matsumoto (Hiroshima University)
61
+ *
62
+ */
63
+ /*
64
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
65
+ * University. All rights reserved.
66
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
67
+ * University and University of Tokyo. All rights reserved.
68
+ *
69
+ * Redistribution and use in source and binary forms, with or without
70
+ * modification, are permitted provided that the following conditions are
71
+ * met:
72
+ *
73
+ * * Redistributions of source code must retain the above copyright
74
+ * notice, this list of conditions and the following disclaimer.
75
+ * * Redistributions in binary form must reproduce the above
76
+ * copyright notice, this list of conditions and the following
77
+ * disclaimer in the documentation and/or other materials provided
78
+ * with the distribution.
79
+ * * Neither the name of the Hiroshima University nor the names of
80
+ * its contributors may be used to endorse or promote products
81
+ * derived from this software without specific prior written
82
+ * permission.
83
+ *
84
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
85
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
86
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
87
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
88
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
89
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
90
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
91
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
92
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
93
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
94
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
95
+ */
96
+
97
+
98
+ #define MTGPDC_MEXP 11213
99
+ #define MTGPDC_N 351
100
+ #define MTGPDC_FLOOR_2P 256
101
+ #define MTGPDC_CEIL_2P 512
102
+ #define MTGPDC_PARAM_TABLE mtgp32dc_params_fast_11213
103
+ #define MTGP32_STATE_SIZE 1024
104
+ #define MTGP32_STATE_MASK 1023
105
+ #define CURAND_NUM_MTGP32_PARAMS 200
106
+ #define MEXP 11213
107
+ #define THREAD_NUM MTGPDC_FLOOR_2P
108
+ #define LARGE_SIZE (THREAD_NUM * 3)
109
+ #define TBL_SIZE 16
110
+
111
+ /**
112
+ * \addtogroup DEVICE Device API
113
+ *
114
+ * @{
115
+ */
116
+
117
+ /*
118
+ * \struct MTGP32_PARAMS_FAST_T
119
+ * MTGP32 parameters.
120
+ * Some element is redundant to keep structure simple.
121
+ *
122
+ * \b pos is a pick up position which is selected to have good
123
+ * performance on graphic processors. 3 < \b pos < Q, where Q is a
124
+ * maximum number such that the size of status array - Q is a power of
125
+ * 2. For example, when \b mexp is 44497, size of 32-bit status array
126
+ * is 696, and Q is 184, then \b pos is between 4 and 183. This means
127
+ * 512 parallel calculations is allowed when \b mexp is 44497.
128
+ *
129
+ * \b poly_sha1 is SHA1 digest of the characteristic polynomial of
130
+ * state transition function. SHA1 is calculated based on printing
131
+ * form of the polynomial. This is important when we use parameters
132
+ * generated by the dynamic creator which
133
+ *
134
+ * \b mask This is a mask to make the dimension of state space have
135
+ * just Mersenne Prime. This is redundant.
136
+ */
137
+
138
+ struct mtgp32_params_fast;
139
+
140
+ struct mtgp32_params_fast {
141
+ int mexp; /*< Mersenne exponent. This is redundant. */
142
+ int pos; /*< pick up position. */
143
+ int sh1; /*< shift value 1. 0 < sh1 < 32. */
144
+ int sh2; /*< shift value 2. 0 < sh2 < 32. */
145
+ unsigned int tbl[16]; /*< a small matrix. */
146
+ unsigned int tmp_tbl[16]; /*< a small matrix for tempering. */
147
+ unsigned int flt_tmp_tbl[16]; /*< a small matrix for tempering and
148
+ converting to float. */
149
+ unsigned int mask; /*< This is a mask for state space */
150
+ unsigned char poly_sha1[21]; /*< SHA1 digest */
151
+ };
152
+
153
+ /** \cond UNHIDE_TYPEDEFS */
154
+ typedef struct mtgp32_params_fast mtgp32_params_fast_t;
155
+ /** \endcond */
156
+
157
+ /*
158
+ * Generator Parameters.
159
+ */
160
+ struct mtgp32_kernel_params;
161
+ struct mtgp32_kernel_params {
162
+ unsigned int pos_tbl[CURAND_NUM_MTGP32_PARAMS];
163
+ unsigned int param_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
164
+ unsigned int temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
165
+ unsigned int single_temper_tbl[CURAND_NUM_MTGP32_PARAMS][TBL_SIZE];
166
+ unsigned int sh1_tbl[CURAND_NUM_MTGP32_PARAMS];
167
+ unsigned int sh2_tbl[CURAND_NUM_MTGP32_PARAMS];
168
+ unsigned int mask[1];
169
+ };
170
+
171
+ /** \cond UNHIDE_TYPEDEFS */
172
+ typedef struct mtgp32_kernel_params mtgp32_kernel_params_t;
173
+ /** \endcond */
174
+
175
+
176
+
177
+ /*
178
+ * kernel I/O
179
+ * This structure must be initialized before first use.
180
+ */
181
+
182
+ /* MTGP (Mersenne Twister) RNG */
183
+ /* This generator uses the Mersenne Twister algorithm of
184
+ * http://arxiv.org/abs/1005.4973v2
185
+ * Has period 2^11213.
186
+ */
187
+
188
+ /**
189
+ * CURAND MTGP32 state
190
+ */
191
+ struct curandStateMtgp32;
192
+
193
+ struct curandStateMtgp32 {
194
+ unsigned int s[MTGP32_STATE_SIZE];
195
+ int offset;
196
+ int pIdx;
197
+ mtgp32_kernel_params_t * k;
198
+ };
199
+
200
+ /*
201
+ * CURAND MTGP32 state
202
+ */
203
+ /** \cond UNHIDE_TYPEDEFS */
204
+ typedef struct curandStateMtgp32 curandStateMtgp32_t;
205
+ /** \endcond */
206
+
207
+ /** @} */
208
+
209
+ #endif
210
+
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_host.h ADDED
@@ -0,0 +1,516 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * curand_mtgp32_host.h
52
+ *
53
+ *
54
+ * MTGP32-11213
55
+ *
56
+ * Mersenne Twister RNG for the GPU
57
+ *
58
+ * The period of generated integers is 2<sup>11213</sup>-1.
59
+ *
60
+ * This code generates 32-bit unsigned integers, and
61
+ * single precision floating point numbers uniformly distributed
62
+ * in the range [1, 2). (float r; 1.0 <= r < 2.0)
63
+ */
64
+
65
+ /*
66
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
67
+ * University. All rights reserved.
68
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
69
+ * University and University of Tokyo. All rights reserved.
70
+ *
71
+ * Redistribution and use in source and binary forms, with or without
72
+ * modification, are permitted provided that the following conditions are
73
+ * met:
74
+ *
75
+ * * Redistributions of source code must retain the above copyright
76
+ * notice, this list of conditions and the following disclaimer.
77
+ * * Redistributions in binary form must reproduce the above
78
+ * copyright notice, this list of conditions and the following
79
+ * disclaimer in the documentation and/or other materials provided
80
+ * with the distribution.
81
+ * * Neither the name of the Hiroshima University nor the names of
82
+ * its contributors may be used to endorse or promote products
83
+ * derived from this software without specific prior written
84
+ * permission.
85
+ *
86
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
87
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
88
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
89
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
90
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
91
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
92
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
93
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
94
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
95
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
96
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97
+ */
98
+ #if !defined CURAND_MTGP32_HOST_H
99
+ #define CURAND_MTGP32_HOST_H
100
+
101
+ #if !defined(QUALIFIERS)
102
+ #define QUALIFIERS static inline __device__
103
+ #endif
104
+
105
+ #include <cuda_runtime.h>
106
+ #include <stdlib.h>
107
+ #include <memory.h>
108
+ #include <string.h>
109
+ #include "curand.h"
110
+ #include "curand_mtgp32.h"
111
+ #include "curand_mtgp32dc_p_11213.h"
112
+
113
+
114
+ /**
115
+ * \addtogroup DEVICE Device API
116
+ *
117
+ * @{
118
+ */
119
+
120
+ static const unsigned int non_zero = 0x4d544750;
121
+
122
+ /*
123
+ * This function represents a function used in the initialization
124
+ * by mtgp32_init_by_array() and mtgp32_init_by_str().
125
+ * @param[in] x 32-bit integer
126
+ * @return 32-bit integer
127
+ */
128
+ static __forceinline__ unsigned int ini_func1(unsigned int x) {
129
+ return (x ^ (x >> 27)) * (1664525);
130
+ }
131
+
132
+ /*
133
+ * This function represents a function used in the initialization
134
+ * by mtgp32_init_by_array() and mtgp32_init_by_str().
135
+ * @param[in] x 32-bit integer
136
+ * @return 32-bit integer
137
+ */
138
+ static __forceinline__ unsigned int ini_func2(unsigned int x) {
139
+ return (x ^ (x >> 27)) * (1566083941);
140
+ }
141
+
142
+ /*
143
+ * This function initializes the internal state array with a 32-bit
144
+ * integer seed. The allocated memory should be freed by calling
145
+ * mtgp32_free(). \b para should be one of the elements in the
146
+ * parameter table (mtgp32-param-ref.c).
147
+ *
148
+ * This function is call by cuda program, because cuda program uses
149
+ * another structure and another allocation method.
150
+ *
151
+ * @param[out] array MTGP internal status vector.
152
+ * @param[in] para parameter structure
153
+ * @param[in] seed a 32-bit integer used as the seed.
154
+ */
155
+ static __forceinline__ __host__
156
+ void mtgp32_init_state(unsigned int state[],
157
+ const mtgp32_params_fast_t *para, unsigned int seed) {
158
+ int i;
159
+ int size = para->mexp / 32 + 1;
160
+ unsigned int hidden_seed;
161
+ unsigned int tmp;
162
+ hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
163
+ tmp = hidden_seed;
164
+ tmp += tmp >> 16;
165
+ tmp += tmp >> 8;
166
+ memset(state, tmp & 0xff, sizeof(unsigned int) * size);
167
+ state[0] = seed;
168
+ state[1] = hidden_seed;
169
+ for (i = 1; i < size; i++) {
170
+ state[i] ^= (1812433253) * (state[i - 1] ^ (state[i - 1] >> 30)) + i;
171
+ }
172
+ }
173
+
174
+ /*
175
+ * This function initializes the internal state array
176
+ * with a 32-bit integer array. \b para should be one of the elements in
177
+ * the parameter table (mtgp32-param-ref.c).
178
+ *
179
+ * @param[out] mtgp32 MTGP structure.
180
+ * @param[in] para parameter structure
181
+ * @param[in] array a 32-bit integer array used as a seed.
182
+ * @param[in] length length of the array.
183
+ * @return CURAND_STATUS_SUCCESS
184
+ */
185
+ static __forceinline__ __host__
186
+ int mtgp32_init_by_array(unsigned int state[],
187
+ const mtgp32_params_fast_t *para,
188
+ unsigned int *array, int length) {
189
+ int i, j, count;
190
+ unsigned int r;
191
+ int lag;
192
+ int mid;
193
+ int size = para->mexp / 32 + 1;
194
+ unsigned int hidden_seed;
195
+ unsigned int tmp;
196
+
197
+ if (size >= 623) {
198
+ lag = 11;
199
+ } else if (size >= 68) {
200
+ lag = 7;
201
+ } else if (size >= 39) {
202
+ lag = 5;
203
+ } else {
204
+ lag = 3;
205
+ }
206
+ mid = (size - lag) / 2;
207
+
208
+ hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
209
+ tmp = hidden_seed;
210
+ tmp += tmp >> 16;
211
+ tmp += tmp >> 8;
212
+ memset(state, tmp & 0xff, sizeof(unsigned int) * size);
213
+ state[0] = hidden_seed;
214
+
215
+ if (length + 1 > size) {
216
+ count = length + 1;
217
+ } else {
218
+ count = size;
219
+ }
220
+ r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]);
221
+ state[mid] += r;
222
+ r += length;
223
+ state[(mid + lag) % size] += r;
224
+ state[0] = r;
225
+ i = 1;
226
+ count--;
227
+ for (i = 1, j = 0; (j < count) && (j < length); j++) {
228
+ r = ini_func1(state[i] ^ state[(i + mid) % size]
229
+ ^ state[(i + size - 1) % size]);
230
+ state[(i + mid) % size] += r;
231
+ r += array[j] + i;
232
+ state[(i + mid + lag) % size] += r;
233
+ state[i] = r;
234
+ i = (i + 1) % size;
235
+ }
236
+ for (; j < count; j++) {
237
+ r = ini_func1(state[i] ^ state[(i + mid) % size]
238
+ ^ state[(i + size - 1) % size]);
239
+ state[(i + mid) % size] += r;
240
+ r += i;
241
+ state[(i + mid + lag) % size] += r;
242
+ state[i] = r;
243
+ i = (i + 1) % size;
244
+ }
245
+ for (j = 0; j < size; j++) {
246
+ r = ini_func2(state[i] + state[(i + mid) % size]
247
+ + state[(i + size - 1) % size]);
248
+ state[(i + mid) % size] ^= r;
249
+ r -= i;
250
+ state[(i + mid + lag) % size] ^= r;
251
+ state[i] = r;
252
+ i = (i + 1) % size;
253
+ }
254
+ if (state[size - 1] == 0) {
255
+ state[size - 1] = non_zero;
256
+ }
257
+ return 0;
258
+ }
259
+
260
+ /*
261
+ * This function initializes the internal state array
262
+ * with a character array. \b para should be one of the elements in
263
+ * the parameter table (mtgp32-param-ref.c).
264
+ * This is the same algorithm with mtgp32_init_by_array(), but hope to
265
+ * be more useful.
266
+ *
267
+ * @param[out] mtgp32 MTGP structure.
268
+ * @param[in] para parameter structure
269
+ * @param[in] array a character array used as a seed. (terminated by zero.)
270
+ * @return memory allocation result. if 0 then O.K.
271
+ */
272
+ static __forceinline__ __host__
273
+ int mtgp32_init_by_str(unsigned int state[],
274
+ const mtgp32_params_fast_t *para, unsigned char *array) {
275
+ int i, j, count;
276
+ unsigned int r;
277
+ int lag;
278
+ int mid;
279
+ int size = para->mexp / 32 + 1;
280
+ int length = (unsigned int)strlen((char *)array);
281
+ unsigned int hidden_seed;
282
+ unsigned int tmp;
283
+
284
+ if (size >= 623) {
285
+ lag = 11;
286
+ } else if (size >= 68) {
287
+ lag = 7;
288
+ } else if (size >= 39) {
289
+ lag = 5;
290
+ } else {
291
+ lag = 3;
292
+ }
293
+ mid = (size - lag) / 2;
294
+
295
+ hidden_seed = para->tbl[4] ^ (para->tbl[8] << 16);
296
+ tmp = hidden_seed;
297
+ tmp += tmp >> 16;
298
+ tmp += tmp >> 8;
299
+ memset(state, tmp & 0xff, sizeof(unsigned int) * size);
300
+ state[0] = hidden_seed;
301
+
302
+ if (length + 1 > size) {
303
+ count = length + 1;
304
+ } else {
305
+ count = size;
306
+ }
307
+ r = ini_func1(state[0] ^ state[mid] ^ state[size - 1]);
308
+ state[mid] += r;
309
+ r += length;
310
+ state[(mid + lag) % size] += r;
311
+ state[0] = r;
312
+ i = 1;
313
+ count--;
314
+ for (i = 1, j = 0; (j < count) && (j < length); j++) {
315
+ r = ini_func1(state[i] ^ state[(i + mid) % size]
316
+ ^ state[(i + size - 1) % size]);
317
+ state[(i + mid) % size] += r;
318
+ r += array[j] + i;
319
+ state[(i + mid + lag) % size] += r;
320
+ state[i] = r;
321
+ i = (i + 1) % size;
322
+ }
323
+ for (; j < count; j++) {
324
+ r = ini_func1(state[i] ^ state[(i + mid) % size]
325
+ ^ state[(i + size - 1) % size]);
326
+ state[(i + mid) % size] += r;
327
+ r += i;
328
+ state[(i + mid + lag) % size] += r;
329
+ state[i] = r;
330
+ i = (i + 1) % size;
331
+ }
332
+ for (j = 0; j < size; j++) {
333
+ r = ini_func2(state[i] + state[(i + mid) % size]
334
+ + state[(i + size - 1) % size]);
335
+ state[(i + mid) % size] ^= r;
336
+ r -= i;
337
+ state[(i + mid + lag) % size] ^= r;
338
+ state[i] = r;
339
+ i = (i + 1) % size;
340
+ }
341
+ if (state[size - 1] == 0) {
342
+ state[size - 1] = non_zero;
343
+ }
344
+ return 0;
345
+ }
346
+
347
+ template<typename ParamsType>
348
+ static __forceinline__ __host__
349
+ curandStatus_t curandMakeMTGP32ConstantsImpl(const mtgp32_params_fast_t params[], ParamsType * p, const int block_num)
350
+ {
351
+ const int size1 = sizeof(unsigned int) * block_num;
352
+ const int size2 = sizeof(unsigned int) * block_num * TBL_SIZE;
353
+ unsigned int *h_pos_tbl;
354
+ unsigned int *h_sh1_tbl;
355
+ unsigned int *h_sh2_tbl;
356
+ unsigned int *h_param_tbl;
357
+ unsigned int *h_temper_tbl;
358
+ unsigned int *h_single_temper_tbl;
359
+ unsigned int *h_mask;
360
+ curandStatus_t status = CURAND_STATUS_SUCCESS;
361
+
362
+ h_pos_tbl = (unsigned int *)malloc(size1);
363
+ h_sh1_tbl = (unsigned int *)malloc(size1);
364
+ h_sh2_tbl = (unsigned int *)malloc(size1);
365
+ h_param_tbl = (unsigned int *)malloc(size2);
366
+ h_temper_tbl = (unsigned int *)malloc(size2);
367
+ h_single_temper_tbl = (unsigned int *)malloc(size2);
368
+ h_mask = (unsigned int *)malloc(sizeof(unsigned int));
369
+ if (h_pos_tbl == NULL
370
+ || h_sh1_tbl == NULL
371
+ || h_sh2_tbl == NULL
372
+ || h_param_tbl == NULL
373
+ || h_temper_tbl == NULL
374
+ || h_single_temper_tbl == NULL
375
+ || h_mask == NULL) {
376
+ if (h_pos_tbl != NULL) free(h_pos_tbl);
377
+ if (h_sh1_tbl != NULL) free(h_sh1_tbl);
378
+ if (h_sh2_tbl != NULL) free(h_sh2_tbl);
379
+ if (h_param_tbl != NULL) free(h_param_tbl);
380
+ if (h_temper_tbl != NULL) free(h_temper_tbl);
381
+ if (h_single_temper_tbl != NULL) free(h_single_temper_tbl);
382
+ if (h_mask != NULL) free(h_mask);
383
+ status = CURAND_STATUS_ALLOCATION_FAILED;
384
+ } else {
385
+
386
+ h_mask[0] = params[0].mask;
387
+ for (int i = 0; i < block_num; i++) {
388
+ h_pos_tbl[i] = params[i].pos;
389
+ h_sh1_tbl[i] = params[i].sh1;
390
+ h_sh2_tbl[i] = params[i].sh2;
391
+ for (int j = 0; j < TBL_SIZE; j++) {
392
+ h_param_tbl[i * TBL_SIZE + j] = params[i].tbl[j];
393
+ h_temper_tbl[i * TBL_SIZE + j] = params[i].tmp_tbl[j];
394
+ h_single_temper_tbl[i * TBL_SIZE + j] = params[i].flt_tmp_tbl[j];
395
+ }
396
+ }
397
+ if (cudaMemcpy( p->pos_tbl,
398
+ h_pos_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
399
+ {
400
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
401
+ } else
402
+ if (cudaMemcpy( p->sh1_tbl,
403
+ h_sh1_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
404
+ {
405
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
406
+ } else
407
+ if (cudaMemcpy( p->sh2_tbl,
408
+ h_sh2_tbl, size1, cudaMemcpyHostToDevice) != cudaSuccess)
409
+ {
410
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
411
+ } else
412
+ if (cudaMemcpy( p->param_tbl,
413
+ h_param_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
414
+ {
415
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
416
+ } else
417
+ if (cudaMemcpy( p->temper_tbl,
418
+ h_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
419
+ {
420
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
421
+ } else
422
+ if (cudaMemcpy( p->single_temper_tbl,
423
+ h_single_temper_tbl, size2, cudaMemcpyHostToDevice) != cudaSuccess)
424
+ {
425
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
426
+ } else
427
+ if (cudaMemcpy( p->mask,
428
+ h_mask, sizeof(unsigned int), cudaMemcpyHostToDevice) != cudaSuccess)
429
+ {
430
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
431
+ }
432
+ }
433
+ if (h_pos_tbl != NULL) free(h_pos_tbl);
434
+ if (h_sh1_tbl != NULL) free(h_sh1_tbl);
435
+ if (h_sh2_tbl != NULL) free(h_sh2_tbl);
436
+ if (h_param_tbl != NULL) free(h_param_tbl);
437
+ if (h_temper_tbl != NULL) free(h_temper_tbl);
438
+ if (h_single_temper_tbl != NULL)free(h_single_temper_tbl);
439
+ if (h_mask != NULL) free(h_mask);
440
+ return status;
441
+ }
442
+
443
+ /**
444
+ * \brief Set up constant parameters for the mtgp32 generator
445
+ *
446
+ * This host-side helper function re-organizes CURAND_NUM_MTGP32_PARAMS sets of
447
+ * generator parameters for use by kernel functions and copies the
448
+ * result to the specified location in device memory.
449
+ *
450
+ * \param params - Pointer to an array of type mtgp32_params_fast_t in host memory
451
+ * \param p - pointer to a structure of type mtgp32_kernel_params_t in device memory.
452
+ *
453
+ * \return
454
+ * - CURAND_STATUS_ALLOCATION_FAILED if host memory could not be allocated
455
+ * - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed
456
+ * - CURAND_STATUS_SUCCESS otherwise
457
+ */
458
+ static __forceinline__ __host__
459
+ curandStatus_t curandMakeMTGP32Constants(const mtgp32_params_fast_t params[], mtgp32_kernel_params_t * p)
460
+ {
461
+ return curandMakeMTGP32ConstantsImpl(params, p, CURAND_NUM_MTGP32_PARAMS);
462
+ }
463
+
464
+ /**
465
+ * \brief Set up initial states for the mtgp32 generator
466
+ *
467
+ * This host-side helper function initializes a number of states (one parameter set per state) for
468
+ * an mtgp32 generator. To accomplish this it allocates a state array in host memory,
469
+ * initializes that array, and copies the result to device memory.
470
+ *
471
+ * \param s - pointer to an array of states in device memory
472
+ * \param params - Pointer to an array of type mtgp32_params_fast_t in host memory
473
+ * \param k - pointer to a structure of type mtgp32_kernel_params_t in device memory
474
+ * \param n - number of parameter sets/states to initialize
475
+ * \param seed - seed value
476
+ *
477
+ * \return
478
+ * - CURAND_STATUS_ALLOCATION_FAILED if host memory state could not be allocated
479
+ * - CURAND_STATUS_INITIALIZATION_FAILED if the copy to device memory failed
480
+ * - CURAND_STATUS_SUCCESS otherwise
481
+ */
482
+ static __forceinline__ __host__
483
+ curandStatus_t CURANDAPI curandMakeMTGP32KernelState(curandStateMtgp32_t *s,
484
+ mtgp32_params_fast_t params[],
485
+ mtgp32_kernel_params_t *k,
486
+ int n,
487
+ unsigned long long seed)
488
+ {
489
+ int i;
490
+ curandStatus_t status = CURAND_STATUS_SUCCESS;
491
+ curandStateMtgp32_t *h_status =(curandStateMtgp32_t *) malloc(sizeof(curandStateMtgp32_t) * n);
492
+ if (h_status == NULL) {
493
+ status = CURAND_STATUS_ALLOCATION_FAILED;
494
+ } else {
495
+ seed = seed ^ (seed >> 32);
496
+ for (i = 0; i < n; i++) {
497
+ mtgp32_init_state(&(h_status[i].s[0]), &params[i],(unsigned int)seed + i + 1);
498
+ h_status[i].offset = 0;
499
+ h_status[i].pIdx = i;
500
+ h_status[i].k = k;
501
+ }
502
+ if (cudaMemcpy(s, h_status,
503
+ sizeof(curandStateMtgp32_t) * n,
504
+ cudaMemcpyHostToDevice) != cudaSuccess) {
505
+ status = CURAND_STATUS_INITIALIZATION_FAILED;
506
+ }
507
+ }
508
+ free(h_status);
509
+ return status;
510
+ }
511
+
512
+ /** @} */
513
+
514
+ #endif
515
+
516
+
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32_kernel.h ADDED
@@ -0,0 +1,386 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * curand_mtgp32_kernel.h
52
+ *
53
+ *
54
+ * MTGP32-11213
55
+ *
56
+ * Mersenne Twister RNG for the GPU
57
+ *
58
+ * The period of generated integers is 2<sup>11213</sup>-1.
59
+ *
60
+ * This code generates 32-bit unsigned integers, and
61
+ * single precision floating point numbers uniformly distributed
62
+ * in the range [1, 2). (float r; 1.0 <= r < 2.0)
63
+ */
64
+
65
+ /*
66
+ * Copyright (c) 2009, 2010 Mutsuo Saito, Makoto Matsumoto and Hiroshima
67
+ * University. All rights reserved.
68
+ * Copyright (c) 2011 Mutsuo Saito, Makoto Matsumoto, Hiroshima
69
+ * University and University of Tokyo. All rights reserved.
70
+ *
71
+ * Redistribution and use in source and binary forms, with or without
72
+ * modification, are permitted provided that the following conditions are
73
+ * met:
74
+ *
75
+ * * Redistributions of source code must retain the above copyright
76
+ * notice, this list of conditions and the following disclaimer.
77
+ * * Redistributions in binary form must reproduce the above
78
+ * copyright notice, this list of conditions and the following
79
+ * disclaimer in the documentation and/or other materials provided
80
+ * with the distribution.
81
+ * * Neither the name of the Hiroshima University nor the names of
82
+ * its contributors may be used to endorse or promote products
83
+ * derived from this software without specific prior written
84
+ * permission.
85
+ *
86
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
87
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
88
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
89
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
90
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
91
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
92
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
93
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
94
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
95
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
96
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
97
+ */
98
+ #if !defined CURAND_MTGP32_KERNEL_H
99
+ #define CURAND_MTGP32_KERNEL_H
100
+
101
+ #if !defined(QUALIFIERS)
102
+ #define QUALIFIERS static __forceinline__ __device__
103
+ #endif
104
+
105
+ #ifndef __CUDACC_RTC__
106
+ #include <cuda_runtime.h>
107
+ #include <stdlib.h>
108
+ #include <memory.h>
109
+ #include <string.h>
110
+ #endif // ifndef __CUDACC_RTC__
111
+ #include <nv/target>
112
+ #include "curand.h"
113
+ #include "curand_mtgp32.h"
114
+
115
+ /**
116
+ * \addtogroup DEVICE Device API
117
+ *
118
+ * @{
119
+ */
120
+
121
+ #ifndef __CUDA_ARCH__
122
+ // define blockDim and threadIdx for host compatibility call
123
+ extern const dim3 blockDim;
124
+ extern const uint3 threadIdx;
125
+ #endif
126
+
127
+
128
+ /*
129
+ * The function of the recursion formula calculation.
130
+ *
131
+ * @param[in] X1 the farthest part of state array.
132
+ * @param[in] X2 the second farthest part of state array.
133
+ * @param[in] Y a part of state array.
134
+ * @param[in] bid block id.
135
+ * @return output
136
+ */
137
+ QUALIFIERS unsigned int para_rec(mtgp32_kernel_params_t * k,unsigned int X1, unsigned int X2, unsigned int Y, int bid) {
138
+ unsigned int X = (X1 & k->mask[0]) ^ X2;
139
+ unsigned int MAT;
140
+
141
+ X ^= X << k->sh1_tbl[bid];
142
+ Y = X ^ (Y >> k->sh2_tbl[bid]);
143
+ MAT = k->param_tbl[bid][Y & 0x0f];
144
+ return Y ^ MAT;
145
+ }
146
+
147
+ /*
148
+ * The tempering function.
149
+ *
150
+ * @param[in] V the output value should be tempered.
151
+ * @param[in] T the tempering helper value.
152
+ * @param[in] bid block id.
153
+ * @return the tempered value.
154
+ */
155
+ QUALIFIERS unsigned int temper(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
156
+ unsigned int MAT;
157
+
158
+ T ^= T >> 16;
159
+ T ^= T >> 8;
160
+ MAT = k->temper_tbl[bid][T & 0x0f];
161
+ return V ^ MAT;
162
+ }
163
+
164
+ /*
165
+ * The tempering and converting function.
166
+ * By using the preset table, converting to IEEE format
167
+ * and tempering are done simultaneously.
168
+ *
169
+ * @param[in] V the output value should be tempered.
170
+ * @param[in] T the tempering helper value.
171
+ * @param[in] bid block id.
172
+ * @return the tempered and converted value.
173
+ */
174
+ QUALIFIERS unsigned int temper_single(mtgp32_kernel_params_t * k,unsigned int V, unsigned int T, int bid) {
175
+ unsigned int MAT;
176
+ unsigned int r;
177
+
178
+ T ^= T >> 16;
179
+ T ^= T >> 8;
180
+ MAT = k->single_temper_tbl[bid][T & 0x0f];
181
+ r = (V >> 9) ^ MAT;
182
+ return r;
183
+ }
184
+
185
+ /**
186
+ * \brief Return 32-bits of pseudorandomness from a mtgp32 generator.
187
+ *
188
+ * Return 32-bits of pseudorandomness from the mtgp32 generator in \p state,
189
+ * increment position of generator by the number of threads in the block.
190
+ * Note the number of threads in the block can not exceed 256.
191
+ *
192
+ * \param state - Pointer to state to update
193
+ *
194
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
195
+ */
196
+ QUALIFIERS unsigned int curand(curandStateMtgp32_t *state)
197
+ {
198
+ unsigned int t;
199
+ unsigned int d;
200
+ int pos = state->k->pos_tbl[state->pIdx];
201
+ unsigned int r;
202
+ unsigned int o;
203
+
204
+ d = blockDim.z * blockDim.y * blockDim.x;
205
+ //assert( d <= 256 );
206
+ t = (blockDim.z * blockDim.y * threadIdx.z) + (blockDim.x * threadIdx.y) + threadIdx.x;
207
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
208
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
209
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
210
+ state->pIdx);
211
+
212
+ state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
213
+ o = temper(state->k, r,
214
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
215
+ state->pIdx);
216
+ NV_IF_TARGET(NV_IS_DEVICE,
217
+ __syncthreads();
218
+ )
219
+ if (t == 0)
220
+ {
221
+ state->offset = (state->offset + d) & MTGP32_STATE_MASK;
222
+ }
223
+ NV_IF_TARGET(NV_IS_DEVICE,
224
+ __syncthreads();
225
+ )
226
+ return o;
227
+
228
+ }
229
+ /**
230
+ * \brief Return 32-bits of pseudorandomness from a specific position in a mtgp32 generator.
231
+ *
232
+ * Return 32-bits of pseudorandomness from position \p index of the mtgp32 generator in \p state,
233
+ * increment position of generator by \p n positions, which must be the total number of positions
234
+ * upddated in the state by the thread block, for this invocation.
235
+ *
236
+ * Note :
237
+ * Thread indices must range from 0...\ n - 1.
238
+ * The number of positions updated may not exceed 256.
239
+ * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
240
+ *
241
+ * \param state - Pointer to state to update
242
+ * \param index - Index (0..255) of the position within the state to draw from and update
243
+ * \param n - The total number of postions in this state that are being updated by this invocation
244
+ *
245
+ * \return 32-bits of pseudorandomness as an unsigned int, all bits valid to use.
246
+ */
247
+ QUALIFIERS unsigned int curand_mtgp32_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
248
+ {
249
+ unsigned int t;
250
+ int pos = state->k->pos_tbl[state->pIdx];
251
+ unsigned int r;
252
+ unsigned int o;
253
+
254
+ t = index;
255
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
256
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
257
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
258
+ state->pIdx);
259
+
260
+ state->s[(t + state->offset + MTGPDC_N) & MTGP32_STATE_MASK] = r;
261
+ o = temper(state->k, r,
262
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
263
+ state->pIdx);
264
+ NV_IF_TARGET(NV_IS_DEVICE,
265
+ __syncthreads();
266
+ )
267
+ if (index == 0)
268
+ {
269
+ state->offset = (state->offset + n) & MTGP32_STATE_MASK;
270
+ }
271
+ NV_IF_TARGET(NV_IS_DEVICE,
272
+ __syncthreads();
273
+ )
274
+ return o;
275
+ }
276
+ /**
277
+ * \brief Return a uniformly distributed float from a mtgp32 generator.
278
+ *
279
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
280
+ * from the mtgp32 generator in \p state, increment position of generator.
281
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
282
+ * point outputs are never returned.
283
+ *
284
+ * Note: This alternate derivation of a uniform float is provided for completeness
285
+ * with the original source
286
+ *
287
+ * \param state - Pointer to state to update
288
+ *
289
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
290
+ */
291
+ QUALIFIERS float curand_mtgp32_single(curandStateMtgp32_t *state)
292
+ {
293
+ unsigned int t;
294
+ unsigned int d;
295
+ int pos = state->k->pos_tbl[state->pIdx];
296
+ unsigned int r;
297
+ unsigned int o_u;
298
+ float o_f;
299
+
300
+
301
+ t = blockDim.z * blockDim.y;
302
+ d = t * blockDim.x;
303
+ //assert( d <= 256 );
304
+ t += threadIdx.x;
305
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
306
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
307
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
308
+ state->pIdx);
309
+
310
+ state->s[t] = r;
311
+ o_u = temper_single(state->k, r,
312
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
313
+ state->pIdx);
314
+ NV_IF_TARGET(NV_IS_DEVICE,
315
+ __syncthreads();
316
+ )
317
+ if (threadIdx.x == 0)
318
+ {
319
+ state->offset = (state->offset + d) & MTGP32_STATE_MASK;
320
+ }
321
+ NV_IF_TARGET(NV_IS_DEVICE,
322
+ __syncthreads();
323
+ )
324
+ memcpy(&o_f, &o_u, sizeof(o_u));
325
+ return o_f;
326
+ }
327
+
328
+ /**
329
+ * \brief Return a uniformly distributed float from a specific position in a mtgp32 generator.
330
+ *
331
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
332
+ * from position \p index of the mtgp32 generator in \p state, and
333
+ * increment position of generator by \p n positions, which must be the total number of positions
334
+ * upddated in the state by the thread block, for this invocation.
335
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
336
+ * point outputs are never returned.
337
+ *
338
+ * Note 1:
339
+ * Thread indices must range from 0...\p n - 1.
340
+ * The number of positions updated may not exceed 256.
341
+ * A thread block may update more than one state, but a given state may not be updated by more than one thread block.
342
+ *
343
+ * Note 2: This alternate derivation of a uniform float is provided for completeness
344
+ * with the original source
345
+ *
346
+ * \param state - Pointer to state to update
347
+ * \param index - Index (0..255) of the position within the state to draw from and update
348
+ * \param n - The total number of postions in this state that are being updated by this invocation
349
+ *
350
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
351
+ */
352
+ QUALIFIERS float curand_mtgp32_single_specific(curandStateMtgp32_t *state, unsigned char index, unsigned char n)
353
+ {
354
+ unsigned int t;
355
+ int pos = state->k->pos_tbl[state->pIdx];
356
+ unsigned int r;
357
+ unsigned int o_u;
358
+ float o_f;
359
+
360
+ t = index;
361
+ r = para_rec(state->k, state->s[(t + state->offset) & MTGP32_STATE_MASK],
362
+ state->s[(t + state->offset + 1) & MTGP32_STATE_MASK],
363
+ state->s[(t + state->offset + pos) & MTGP32_STATE_MASK],
364
+ state->pIdx);
365
+
366
+ state->s[t] = r;
367
+ o_u = temper_single(state->k, r,
368
+ state->s[(t + state->offset + pos -1) & MTGP32_STATE_MASK],
369
+ state->pIdx);
370
+ NV_IF_TARGET(NV_IS_DEVICE,
371
+ __syncthreads();
372
+ )
373
+ if (threadIdx.x == 0)
374
+ {
375
+ state->offset = (state->offset + n) & MTGP32_STATE_MASK;
376
+ }
377
+ NV_IF_TARGET(NV_IS_DEVICE,
378
+ __syncthreads();
379
+ )
380
+ memcpy(&o_f, &o_u, sizeof(o_u));
381
+ return o_f;
382
+ }
383
+
384
+ /** @} */
385
+
386
+ #endif
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_mtgp32dc_p_11213.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal.h ADDED
@@ -0,0 +1,840 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_NORMAL_H_)
52
+ #define CURAND_NORMAL_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+ #include <nv/target>
64
+
65
+ #include "curand_mrg32k3a.h"
66
+ #include "curand_mtgp32_kernel.h"
67
+ #include "curand_philox4x32_x.h"
68
+ #include "curand_normal_static.h"
69
+
70
+ QUALIFIERS float2 _curand_box_muller(unsigned int x, unsigned int y)
71
+ {
72
+ float2 result;
73
+ float u = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2);
74
+ float v = y * CURAND_2POW32_INV_2PI + (CURAND_2POW32_INV_2PI/2);
75
+ float s;
76
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
77
+ s = sqrtf(-2.0f * logf(u));
78
+ __sincosf(v, &result.x, &result.y);
79
+ ,
80
+ s = sqrtf(-2.0f * logf(u));
81
+ result.x = sinf(v);
82
+ result.y = cosf(v);
83
+ )
84
+ result.x *= s;
85
+ result.y *= s;
86
+ return result;
87
+ }
88
+
89
+ QUALIFIERS float2 curand_box_muller_mrg(curandStateMRG32k3a_t * state)
90
+ {
91
+ float x, y;
92
+ x = curand_uniform(state);
93
+ y = curand_uniform(state) * CURAND_2PI;
94
+ float2 result;
95
+ float s;
96
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
97
+ s = sqrtf(-2.0f * logf(x));
98
+ __sincosf(y, &result.x, &result.y);
99
+ ,
100
+ s = sqrtf(-2.0f * logf(x));
101
+ result.x = sinf(y);
102
+ result.y = cosf(y);
103
+ )
104
+ result.x *= s;
105
+ result.y *= s;
106
+ return result;
107
+ }
108
+
109
+ QUALIFIERS double2
110
+ _curand_box_muller_double(unsigned int x0, unsigned int x1,
111
+ unsigned int y0, unsigned int y1)
112
+ {
113
+ double2 result;
114
+ unsigned long long zx = (unsigned long long)x0 ^
115
+ ((unsigned long long)x1 << (53 - 32));
116
+ double u = zx * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
117
+ unsigned long long zy = (unsigned long long)y0 ^
118
+ ((unsigned long long)y1 << (53 - 32));
119
+ double v = zy * (CURAND_2POW53_INV_DOUBLE*2.0) + CURAND_2POW53_INV_DOUBLE;
120
+ double s = sqrt(-2.0 * log(u));
121
+
122
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
123
+ sincospi(v, &result.x, &result.y);
124
+ ,
125
+ result.x = sin(v*CURAND_PI_DOUBLE);
126
+ result.y = cos(v*CURAND_PI_DOUBLE);
127
+ )
128
+ result.x *= s;
129
+ result.y *= s;
130
+
131
+ return result;
132
+ }
133
+
134
+ QUALIFIERS double2
135
+ curand_box_muller_mrg_double(curandStateMRG32k3a_t * state)
136
+ {
137
+ double x, y;
138
+ double2 result;
139
+ x = curand_uniform_double(state);
140
+ y = curand_uniform_double(state) * 2.0;
141
+
142
+ double s = sqrt(-2.0 * log(x));
143
+ NV_IF_ELSE_TARGET(NV_IS_DEVICE,
144
+ sincospi(y, &result.x, &result.y);
145
+ ,
146
+ result.x = sin(y*CURAND_PI_DOUBLE);
147
+ result.y = cos(y*CURAND_PI_DOUBLE);
148
+ )
149
+ result.x *= s;
150
+ result.y *= s;
151
+ return result;
152
+ }
153
+
154
+ template <typename R>
155
+ QUALIFIERS float2 curand_box_muller(R *state)
156
+ {
157
+ float2 result;
158
+ unsigned int x = curand(state);
159
+ unsigned int y = curand(state);
160
+ result = _curand_box_muller(x, y);
161
+ return result;
162
+ }
163
+
164
+ template <typename R>
165
+ QUALIFIERS float4 curand_box_muller4(R *state)
166
+ {
167
+ float4 result;
168
+ float2 _result;
169
+ uint4 x = curand4(state);
170
+ //unsigned int y = curand(state);
171
+ _result = _curand_box_muller(x.x, x.y);
172
+ result.x = _result.x;
173
+ result.y = _result.y;
174
+ _result = _curand_box_muller(x.z, x.w);
175
+ result.z = _result.x;
176
+ result.w = _result.y;
177
+ return result;
178
+ }
179
+
180
+ template <typename R>
181
+ QUALIFIERS double2 curand_box_muller_double(R *state)
182
+ {
183
+ double2 result;
184
+ unsigned int x0 = curand(state);
185
+ unsigned int x1 = curand(state);
186
+ unsigned int y0 = curand(state);
187
+ unsigned int y1 = curand(state);
188
+ result = _curand_box_muller_double(x0, x1, y0, y1);
189
+ return result;
190
+ }
191
+
192
+ template <typename R>
193
+ QUALIFIERS double2 curand_box_muller2_double(R *state)
194
+ {
195
+ double2 result;
196
+ uint4 _x;
197
+ _x = curand4(state);
198
+ result = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
199
+ return result;
200
+ }
201
+
202
+
203
+ template <typename R>
204
+ QUALIFIERS double4 curand_box_muller4_double(R *state)
205
+ {
206
+ double4 result;
207
+ double2 _res1;
208
+ double2 _res2;
209
+ uint4 _x;
210
+ uint4 _y;
211
+ _x = curand4(state);
212
+ _y = curand4(state);
213
+ _res1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
214
+ _res2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
215
+ result.x = _res1.x;
216
+ result.y = _res1.y;
217
+ result.z = _res2.x;
218
+ result.w = _res2.y;
219
+ return result;
220
+ }
221
+
222
+ //QUALIFIERS float _curand_normal_icdf(unsigned int x)
223
+ //{
224
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
225
+ // float s = CURAND_SQRT2;
226
+ // // Mirror to avoid loss of precision
227
+ // if(x > 0x80000000UL) {
228
+ // x = 0xffffffffUL - x;
229
+ // s = -s;
230
+ // }
231
+ // float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
232
+ // // p is in (0, 0.5], 2p is in (0, 1]
233
+ // return s * erfcinvf(2.0f * p);
234
+ //#else
235
+ // x++; //suppress warnings
236
+ // return 0.0f;
237
+ //#endif
238
+ //}
239
+ //
240
+ //QUALIFIERS float _curand_normal_icdf(unsigned long long x)
241
+ //{
242
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
243
+ // unsigned int t = (unsigned int)(x >> 32);
244
+ // float s = CURAND_SQRT2;
245
+ // // Mirror to avoid loss of precision
246
+ // if(t > 0x80000000UL) {
247
+ // t = 0xffffffffUL - t;
248
+ // s = -s;
249
+ // }
250
+ // float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
251
+ // // p is in (0, 0.5], 2p is in (0, 1]
252
+ // return s * erfcinvf(2.0f * p);
253
+ //#else
254
+ // x++;
255
+ // return 0.0f;
256
+ //#endif
257
+ //}
258
+ //
259
+ //QUALIFIERS double _curand_normal_icdf_double(unsigned int x)
260
+ //{
261
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
262
+ // double s = CURAND_SQRT2_DOUBLE;
263
+ // // Mirror to avoid loss of precision
264
+ // if(x > 0x80000000UL) {
265
+ // x = 0xffffffffUL - x;
266
+ // s = -s;
267
+ // }
268
+ // double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
269
+ // // p is in (0, 0.5], 2p is in (0, 1]
270
+ // return s * erfcinv(2.0 * p);
271
+ //#else
272
+ // x++;
273
+ // return 0.0;
274
+ //#endif
275
+ //}
276
+ //
277
+ //QUALIFIERS double _curand_normal_icdf_double(unsigned long long x)
278
+ //{
279
+ //#if __CUDA_ARCH__ > 0 || defined(HOST_HAVE_ERFCINVF)
280
+ // double s = CURAND_SQRT2_DOUBLE;
281
+ // x >>= 11;
282
+ // // Mirror to avoid loss of precision
283
+ // if(x > 0x10000000000000UL) {
284
+ // x = 0x1fffffffffffffUL - x;
285
+ // s = -s;
286
+ // }
287
+ // double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
288
+ // // p is in (0, 0.5], 2p is in (0, 1]
289
+ // return s * erfcinv(2.0 * p);
290
+ //#else
291
+ // x++;
292
+ // return 0.0;
293
+ //#endif
294
+ //}
295
+ //
296
+
297
+ /**
298
+ * \brief Return a normally distributed float from an XORWOW generator.
299
+ *
300
+ * Return a single normally distributed float with mean \p 0.0f and
301
+ * standard deviation \p 1.0f from the XORWOW generator in \p state,
302
+ * increment position of generator by one.
303
+ *
304
+ * The implementation uses a Box-Muller transform to generate two
305
+ * normally distributed results, then returns them one at a time.
306
+ * See ::curand_normal2() for a more efficient version that returns
307
+ * both results at once.
308
+ *
309
+ * \param state - Pointer to state to update
310
+ *
311
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
312
+ */
313
+ QUALIFIERS float curand_normal(curandStateXORWOW_t *state)
314
+ {
315
+ if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
316
+ unsigned int x, y;
317
+ x = curand(state);
318
+ y = curand(state);
319
+ float2 v = _curand_box_muller(x, y);
320
+ state->boxmuller_extra = v.y;
321
+ state->boxmuller_flag = EXTRA_FLAG_NORMAL;
322
+ return v.x;
323
+ }
324
+ state->boxmuller_flag = 0;
325
+ return state->boxmuller_extra;
326
+ }
327
+
328
+ /**
329
+ * \brief Return a normally distributed float from an Philox4_32_10 generator.
330
+ *
331
+ * Return a single normally distributed float with mean \p 0.0f and
332
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
333
+ * increment position of generator by one.
334
+ *
335
+ * The implementation uses a Box-Muller transform to generate two
336
+ * normally distributed results, then returns them one at a time.
337
+ * See ::curand_normal2() for a more efficient version that returns
338
+ * both results at once.
339
+ *
340
+ * \param state - Pointer to state to update
341
+ *
342
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
343
+ */
344
+
345
+ QUALIFIERS float curand_normal(curandStatePhilox4_32_10_t *state)
346
+ {
347
+ if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
348
+ unsigned int x, y;
349
+ x = curand(state);
350
+ y = curand(state);
351
+ float2 v = _curand_box_muller(x, y);
352
+ state->boxmuller_extra = v.y;
353
+ state->boxmuller_flag = EXTRA_FLAG_NORMAL;
354
+ return v.x;
355
+ }
356
+ state->boxmuller_flag = 0;
357
+ return state->boxmuller_extra;
358
+ }
359
+
360
+
361
+
362
+ /**
363
+ * \brief Return a normally distributed float from an MRG32k3a generator.
364
+ *
365
+ * Return a single normally distributed float with mean \p 0.0f and
366
+ * standard deviation \p 1.0f from the MRG32k3a generator in \p state,
367
+ * increment position of generator by one.
368
+ *
369
+ * The implementation uses a Box-Muller transform to generate two
370
+ * normally distributed results, then returns them one at a time.
371
+ * See ::curand_normal2() for a more efficient version that returns
372
+ * both results at once.
373
+ *
374
+ * \param state - Pointer to state to update
375
+ *
376
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
377
+ */
378
+ QUALIFIERS float curand_normal(curandStateMRG32k3a_t *state)
379
+ {
380
+ if(state->boxmuller_flag != EXTRA_FLAG_NORMAL) {
381
+ float2 v = curand_box_muller_mrg(state);
382
+ state->boxmuller_extra = v.y;
383
+ state->boxmuller_flag = EXTRA_FLAG_NORMAL;
384
+ return v.x;
385
+ }
386
+ state->boxmuller_flag = 0;
387
+ return state->boxmuller_extra;
388
+ }
389
+
390
+ /**
391
+ * \brief Return two normally distributed floats from an XORWOW generator.
392
+ *
393
+ * Return two normally distributed floats with mean \p 0.0f and
394
+ * standard deviation \p 1.0f from the XORWOW generator in \p state,
395
+ * increment position of generator by two.
396
+ *
397
+ * The implementation uses a Box-Muller transform to generate two
398
+ * normally distributed results.
399
+ *
400
+ * \param state - Pointer to state to update
401
+ *
402
+ * \return Normally distributed float2 where each element is from a
403
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
404
+ */
405
+ QUALIFIERS float2 curand_normal2(curandStateXORWOW_t *state)
406
+ {
407
+ return curand_box_muller(state);
408
+ }
409
+ /**
410
+ * \brief Return two normally distributed floats from an Philox4_32_10 generator.
411
+ *
412
+ * Return two normally distributed floats with mean \p 0.0f and
413
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
414
+ * increment position of generator by two.
415
+ *
416
+ * The implementation uses a Box-Muller transform to generate two
417
+ * normally distributed results.
418
+ *
419
+ * \param state - Pointer to state to update
420
+ *
421
+ * \return Normally distributed float2 where each element is from a
422
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
423
+ */
424
+ QUALIFIERS float2 curand_normal2(curandStatePhilox4_32_10_t *state)
425
+ {
426
+ return curand_box_muller(state);
427
+ }
428
+
429
+ /**
430
+ * \brief Return four normally distributed floats from an Philox4_32_10 generator.
431
+ *
432
+ * Return four normally distributed floats with mean \p 0.0f and
433
+ * standard deviation \p 1.0f from the Philox4_32_10 generator in \p state,
434
+ * increment position of generator by four.
435
+ *
436
+ * The implementation uses a Box-Muller transform to generate two
437
+ * normally distributed results.
438
+ *
439
+ * \param state - Pointer to state to update
440
+ *
441
+ * \return Normally distributed float2 where each element is from a
442
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
443
+ */
444
+ QUALIFIERS float4 curand_normal4(curandStatePhilox4_32_10_t *state)
445
+ {
446
+ return curand_box_muller4(state);
447
+ }
448
+
449
+
450
+
451
+ /**
452
+ * \brief Return two normally distributed floats from an MRG32k3a generator.
453
+ *
454
+ * Return two normally distributed floats with mean \p 0.0f and
455
+ * standard deviation \p 1.0f from the MRG32k3a generator in \p state,
456
+ * increment position of generator by two.
457
+ *
458
+ * The implementation uses a Box-Muller transform to generate two
459
+ * normally distributed results.
460
+ *
461
+ * \param state - Pointer to state to update
462
+ *
463
+ * \return Normally distributed float2 where each element is from a
464
+ * distribution with mean \p 0.0f and standard deviation \p 1.0f
465
+ */
466
+ QUALIFIERS float2 curand_normal2(curandStateMRG32k3a_t *state)
467
+ {
468
+ return curand_box_muller_mrg(state);
469
+ }
470
+
471
+ /**
472
+ * \brief Return a normally distributed float from a MTGP32 generator.
473
+ *
474
+ * Return a single normally distributed float with mean \p 0.0f and
475
+ * standard deviation \p 1.0f from the MTGP32 generator in \p state,
476
+ * increment position of generator.
477
+ *
478
+ * The implementation uses the inverse cumulative distribution function
479
+ * to generate normally distributed results.
480
+ *
481
+ * \param state - Pointer to state to update
482
+ *
483
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
484
+ */
485
+ QUALIFIERS float curand_normal(curandStateMtgp32_t *state)
486
+ {
487
+ return _curand_normal_icdf(curand(state));
488
+ }
489
+ /**
490
+ * \brief Return a normally distributed float from a Sobol32 generator.
491
+ *
492
+ * Return a single normally distributed float with mean \p 0.0f and
493
+ * standard deviation \p 1.0f from the Sobol32 generator in \p state,
494
+ * increment position of generator by one.
495
+ *
496
+ * The implementation uses the inverse cumulative distribution function
497
+ * to generate normally distributed results.
498
+ *
499
+ * \param state - Pointer to state to update
500
+ *
501
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
502
+ */
503
+ QUALIFIERS float curand_normal(curandStateSobol32_t *state)
504
+ {
505
+ return _curand_normal_icdf(curand(state));
506
+ }
507
+
508
+ /**
509
+ * \brief Return a normally distributed float from a scrambled Sobol32 generator.
510
+ *
511
+ * Return a single normally distributed float with mean \p 0.0f and
512
+ * standard deviation \p 1.0f from the scrambled Sobol32 generator in \p state,
513
+ * increment position of generator by one.
514
+ *
515
+ * The implementation uses the inverse cumulative distribution function
516
+ * to generate normally distributed results.
517
+ *
518
+ * \param state - Pointer to state to update
519
+ *
520
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
521
+ */
522
+ QUALIFIERS float curand_normal(curandStateScrambledSobol32_t *state)
523
+ {
524
+ return _curand_normal_icdf(curand(state));
525
+ }
526
+
527
+ /**
528
+ * \brief Return a normally distributed float from a Sobol64 generator.
529
+ *
530
+ * Return a single normally distributed float with mean \p 0.0f and
531
+ * standard deviation \p 1.0f from the Sobol64 generator in \p state,
532
+ * increment position of generator by one.
533
+ *
534
+ * The implementation uses the inverse cumulative distribution function
535
+ * to generate normally distributed results.
536
+ *
537
+ * \param state - Pointer to state to update
538
+ *
539
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
540
+ */
541
+ QUALIFIERS float curand_normal(curandStateSobol64_t *state)
542
+ {
543
+ return _curand_normal_icdf(curand(state));
544
+ }
545
+
546
+ /**
547
+ * \brief Return a normally distributed float from a scrambled Sobol64 generator.
548
+ *
549
+ * Return a single normally distributed float with mean \p 0.0f and
550
+ * standard deviation \p 1.0f from the scrambled Sobol64 generator in \p state,
551
+ * increment position of generator by one.
552
+ *
553
+ * The implementation uses the inverse cumulative distribution function
554
+ * to generate normally distributed results.
555
+ *
556
+ * \param state - Pointer to state to update
557
+ *
558
+ * \return Normally distributed float with mean \p 0.0f and standard deviation \p 1.0f
559
+ */
560
+ QUALIFIERS float curand_normal(curandStateScrambledSobol64_t *state)
561
+ {
562
+ return _curand_normal_icdf(curand(state));
563
+ }
564
+
565
+ /**
566
+ * \brief Return a normally distributed double from an XORWOW generator.
567
+ *
568
+ * Return a single normally distributed double with mean \p 0.0 and
569
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
570
+ * increment position of generator.
571
+ *
572
+ * The implementation uses a Box-Muller transform to generate two
573
+ * normally distributed results, then returns them one at a time.
574
+ * See ::curand_normal2_double() for a more efficient version that returns
575
+ * both results at once.
576
+ *
577
+ * \param state - Pointer to state to update
578
+ *
579
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
580
+ */
581
+ QUALIFIERS double curand_normal_double(curandStateXORWOW_t *state)
582
+ {
583
+ if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
584
+ unsigned int x0, x1, y0, y1;
585
+ x0 = curand(state);
586
+ x1 = curand(state);
587
+ y0 = curand(state);
588
+ y1 = curand(state);
589
+ double2 v = _curand_box_muller_double(x0, x1, y0, y1);
590
+ state->boxmuller_extra_double = v.y;
591
+ state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
592
+ return v.x;
593
+ }
594
+ state->boxmuller_flag_double = 0;
595
+ return state->boxmuller_extra_double;
596
+ }
597
+
598
+ /**
599
+ * \brief Return a normally distributed double from an Philox4_32_10 generator.
600
+ *
601
+ * Return a single normally distributed double with mean \p 0.0 and
602
+ * standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
603
+ * increment position of generator.
604
+ *
605
+ * The implementation uses a Box-Muller transform to generate two
606
+ * normally distributed results, then returns them one at a time.
607
+ * See ::curand_normal2_double() for a more efficient version that returns
608
+ * both results at once.
609
+ *
610
+ * \param state - Pointer to state to update
611
+ *
612
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
613
+ */
614
+
615
+ QUALIFIERS double curand_normal_double(curandStatePhilox4_32_10_t *state)
616
+ {
617
+ if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
618
+ uint4 _x;
619
+ _x = curand4(state);
620
+ double2 v = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
621
+ state->boxmuller_extra_double = v.y;
622
+ state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
623
+ return v.x;
624
+ }
625
+ state->boxmuller_flag_double = 0;
626
+ return state->boxmuller_extra_double;
627
+ }
628
+
629
+
630
+ /**
631
+ * \brief Return a normally distributed double from an MRG32k3a generator.
632
+ *
633
+ * Return a single normally distributed double with mean \p 0.0 and
634
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
635
+ * increment position of generator.
636
+ *
637
+ * The implementation uses a Box-Muller transform to generate two
638
+ * normally distributed results, then returns them one at a time.
639
+ * See ::curand_normal2_double() for a more efficient version that returns
640
+ * both results at once.
641
+ *
642
+ * \param state - Pointer to state to update
643
+ *
644
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
645
+ */
646
+ QUALIFIERS double curand_normal_double(curandStateMRG32k3a_t *state)
647
+ {
648
+ if(state->boxmuller_flag_double != EXTRA_FLAG_NORMAL) {
649
+ double2 v = curand_box_muller_mrg_double(state);
650
+ state->boxmuller_extra_double = v.y;
651
+ state->boxmuller_flag_double = EXTRA_FLAG_NORMAL;
652
+ return v.x;
653
+ }
654
+ state->boxmuller_flag_double = 0;
655
+ return state->boxmuller_extra_double;
656
+ }
657
+
658
+ /**
659
+ * \brief Return two normally distributed doubles from an XORWOW generator.
660
+ *
661
+ * Return two normally distributed doubles with mean \p 0.0 and
662
+ * standard deviation \p 1.0 from the XORWOW generator in \p state,
663
+ * increment position of generator by 2.
664
+ *
665
+ * The implementation uses a Box-Muller transform to generate two
666
+ * normally distributed results.
667
+ *
668
+ * \param state - Pointer to state to update
669
+ *
670
+ * \return Normally distributed double2 where each element is from a
671
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
672
+ */
673
+ QUALIFIERS double2 curand_normal2_double(curandStateXORWOW_t *state)
674
+ {
675
+ return curand_box_muller_double(state);
676
+ }
677
+
678
+ /**
679
+ * \brief Return two normally distributed doubles from an Philox4_32_10 generator.
680
+ *
681
+ * Return two normally distributed doubles with mean \p 0.0 and
682
+ * standard deviation \p 1.0 from the Philox4_32_10 generator in \p state,
683
+ * increment position of generator by 2.
684
+ *
685
+ * The implementation uses a Box-Muller transform to generate two
686
+ * normally distributed results.
687
+ *
688
+ * \param state - Pointer to state to update
689
+ *
690
+ * \return Normally distributed double2 where each element is from a
691
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
692
+ */
693
+ QUALIFIERS double2 curand_normal2_double(curandStatePhilox4_32_10_t *state)
694
+ {
695
+ uint4 _x;
696
+ double2 result;
697
+
698
+ _x = curand4(state);
699
+ double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
700
+ result.x = v1.x;
701
+ result.y = v1.y;
702
+
703
+ return result;
704
+ }
705
+
706
+ // not a part of API
707
+ QUALIFIERS double4 curand_normal4_double(curandStatePhilox4_32_10_t *state)
708
+ {
709
+ uint4 _x;
710
+ uint4 _y;
711
+ double4 result;
712
+
713
+ _x = curand4(state);
714
+ _y = curand4(state);
715
+ double2 v1 = _curand_box_muller_double(_x.x, _x.y, _x.z, _x.w);
716
+ double2 v2 = _curand_box_muller_double(_y.x, _y.y, _y.z, _y.w);
717
+ result.x = v1.x;
718
+ result.y = v1.y;
719
+ result.z = v2.x;
720
+ result.w = v2.y;
721
+
722
+ return result;
723
+ }
724
+
725
+
726
+ /**
727
+ * \brief Return two normally distributed doubles from an MRG32k3a generator.
728
+ *
729
+ * Return two normally distributed doubles with mean \p 0.0 and
730
+ * standard deviation \p 1.0 from the MRG32k3a generator in \p state,
731
+ * increment position of generator.
732
+ *
733
+ * The implementation uses a Box-Muller transform to generate two
734
+ * normally distributed results.
735
+ *
736
+ * \param state - Pointer to state to update
737
+ *
738
+ * \return Normally distributed double2 where each element is from a
739
+ * distribution with mean \p 0.0 and standard deviation \p 1.0
740
+ */
741
+ QUALIFIERS double2 curand_normal2_double(curandStateMRG32k3a_t *state)
742
+ {
743
+ return curand_box_muller_mrg_double(state);
744
+ }
745
+
746
+ /**
747
+ * \brief Return a normally distributed double from an MTGP32 generator.
748
+ *
749
+ * Return a single normally distributed double with mean \p 0.0 and
750
+ * standard deviation \p 1.0 from the MTGP32 generator in \p state,
751
+ * increment position of generator.
752
+ *
753
+ * The implementation uses the inverse cumulative distribution function
754
+ * to generate normally distributed results.
755
+ *
756
+ * \param state - Pointer to state to update
757
+ *
758
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
759
+ */
760
+ QUALIFIERS double curand_normal_double(curandStateMtgp32_t *state)
761
+ {
762
+ return _curand_normal_icdf_double(curand(state));
763
+ }
764
+
765
+ /**
766
+ * \brief Return a normally distributed double from an Sobol32 generator.
767
+ *
768
+ * Return a single normally distributed double with mean \p 0.0 and
769
+ * standard deviation \p 1.0 from the Sobol32 generator in \p state,
770
+ * increment position of generator by one.
771
+ *
772
+ * The implementation uses the inverse cumulative distribution function
773
+ * to generate normally distributed results.
774
+ *
775
+ * \param state - Pointer to state to update
776
+ *
777
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
778
+ */
779
+ QUALIFIERS double curand_normal_double(curandStateSobol32_t *state)
780
+ {
781
+ return _curand_normal_icdf_double(curand(state));
782
+ }
783
+
784
+ /**
785
+ * \brief Return a normally distributed double from a scrambled Sobol32 generator.
786
+ *
787
+ * Return a single normally distributed double with mean \p 0.0 and
788
+ * standard deviation \p 1.0 from the scrambled Sobol32 generator in \p state,
789
+ * increment position of generator by one.
790
+ *
791
+ * The implementation uses the inverse cumulative distribution function
792
+ * to generate normally distributed results.
793
+ *
794
+ * \param state - Pointer to state to update
795
+ *
796
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
797
+ */
798
+ QUALIFIERS double curand_normal_double(curandStateScrambledSobol32_t *state)
799
+ {
800
+ return _curand_normal_icdf_double(curand(state));
801
+ }
802
+
803
+ /**
804
+ * \brief Return a normally distributed double from a Sobol64 generator.
805
+ *
806
+ * Return a single normally distributed double with mean \p 0.0 and
807
+ * standard deviation \p 1.0 from the Sobol64 generator in \p state,
808
+ * increment position of generator by one.
809
+ *
810
+ * The implementation uses the inverse cumulative distribution function
811
+ * to generate normally distributed results.
812
+ *
813
+ * \param state - Pointer to state to update
814
+ *
815
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
816
+ */
817
+ QUALIFIERS double curand_normal_double(curandStateSobol64_t *state)
818
+ {
819
+ return _curand_normal_icdf_double(curand(state));
820
+ }
821
+
822
+ /**
823
+ * \brief Return a normally distributed double from a scrambled Sobol64 generator.
824
+ *
825
+ * Return a single normally distributed double with mean \p 0.0 and
826
+ * standard deviation \p 1.0 from the scrambled Sobol64 generator in \p state,
827
+ * increment position of generator by one.
828
+ *
829
+ * The implementation uses the inverse cumulative distribution function
830
+ * to generate normally distributed results.
831
+ *
832
+ * \param state - Pointer to state to update
833
+ *
834
+ * \return Normally distributed double with mean \p 0.0 and standard deviation \p 1.0
835
+ */
836
+ QUALIFIERS double curand_normal_double(curandStateScrambledSobol64_t *state)
837
+ {
838
+ return _curand_normal_icdf_double(curand(state));
839
+ }
840
+ #endif // !defined(CURAND_NORMAL_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_normal_static.h ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+ #ifndef CURAND_NORMAL_STATIC_H
49
+ #define CURAND_NORMAL_STATIC_H
50
+
51
+ #define QUALIFIERS_STATIC __host__ __device__ __forceinline__
52
+
53
+ #include <nv/target>
54
+ #if defined(HOST_HAVE_ERFCINVF)
55
+ #define IF_DEVICE_OR_HAVE_ERFCINVF(t, f) _NV_BLOCK_EXPAND(t)
56
+ #else
57
+ #define IF_DEVICE_OR_HAVE_ERFCINVF(t, f) NV_IF_ELSE_TARGET(NV_IS_DEVICE, t, f)
58
+ #endif
59
+
60
+ QUALIFIERS_STATIC float _curand_normal_icdf(unsigned int x)
61
+ {
62
+ IF_DEVICE_OR_HAVE_ERFCINVF(
63
+ float s = CURAND_SQRT2;
64
+ // Mirror to avoid loss of precision
65
+ if(x > 0x80000000UL) {
66
+ x = 0xffffffffUL - x;
67
+ s = -s;
68
+ }
69
+ float p = x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
70
+ // p is in (0, 0.5], 2p is in (0, 1]
71
+ return s * erfcinvf(2.0f * p);
72
+ ,
73
+ x++; //suppress warnings
74
+ return 0.0f;
75
+ )
76
+ }
77
+
78
+ QUALIFIERS_STATIC float _curand_normal_icdf(unsigned long long x)
79
+ {
80
+ IF_DEVICE_OR_HAVE_ERFCINVF(
81
+ unsigned int t = (unsigned int)(x >> 32);
82
+ float s = CURAND_SQRT2;
83
+ // Mirror to avoid loss of precision
84
+ if(t > 0x80000000UL) {
85
+ t = 0xffffffffUL - t;
86
+ s = -s;
87
+ }
88
+ float p = t * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
89
+ // p is in (0 - 0.5] 2p is in (0 - 1]
90
+ return s * erfcinvf(2.0f * p);
91
+ ,
92
+ x++;
93
+ return 0.0f;
94
+ )
95
+ }
96
+
97
+ QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned int x)
98
+ {
99
+ IF_DEVICE_OR_HAVE_ERFCINVF(
100
+ double s = CURAND_SQRT2_DOUBLE;
101
+ // Mirror to avoid loss of precision
102
+ if(x > 0x80000000UL) {
103
+ x = 0xffffffffUL - x;
104
+ s = -s;
105
+ }
106
+ double p = x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
107
+ // p is in (0 - 0.5] 2p is in (0 - 1]
108
+ return s * erfcinv(2.0 * p);
109
+ ,
110
+ x++;
111
+ return 0.0;
112
+ )
113
+ }
114
+
115
+ QUALIFIERS_STATIC double _curand_normal_icdf_double(unsigned long long x)
116
+ {
117
+ IF_DEVICE_OR_HAVE_ERFCINVF(
118
+ double s = CURAND_SQRT2_DOUBLE;
119
+ x >>= 11;
120
+ // Mirror to avoid loss of precision
121
+ if(x > 0x10000000000000UL) {
122
+ x = 0x1fffffffffffffUL - x;
123
+ s = -s;
124
+ }
125
+ double p = x * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
126
+ // p is in (0 - 0.5] 2p is in (0 - 1]
127
+ return s * erfcinv(2.0 * p);
128
+ ,
129
+ x++;
130
+ return 0.0;
131
+ )
132
+ }
133
+ #undef QUALIFIERS_STATIC
134
+ #endif
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_philox4x32_x.h ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+ /*
49
+ Copyright 2010-2011, D. E. Shaw Research.
50
+ All rights reserved.
51
+
52
+ Redistribution and use in source and binary forms, with or without
53
+ modification, are permitted provided that the following conditions are
54
+ met:
55
+
56
+ * Redistributions of source code must retain the above copyright
57
+ notice, this list of conditions, and the following disclaimer.
58
+
59
+ * Redistributions in binary form must reproduce the above copyright
60
+ notice, this list of conditions, and the following disclaimer in the
61
+ documentation and/or other materials provided with the distribution.
62
+
63
+ * Neither the name of D. E. Shaw Research nor the names of its
64
+ contributors may be used to endorse or promote products derived from
65
+ this software without specific prior written permission.
66
+
67
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
68
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
69
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
70
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
71
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
72
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
73
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
74
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
75
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
76
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
77
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
78
+ */
79
+
80
+ #ifndef CURAND_PHILOX4X32_X__H_
81
+ #define CURAND_PHILOX4X32_X__H_
82
+ #include <nv/target>
83
+
84
+ #if !defined(QUALIFIERS)
85
+ #define QUALIFIERS static __forceinline__ __device__
86
+ #endif
87
+
88
+ #define PHILOX_W32_0 (0x9E3779B9)
89
+ #define PHILOX_W32_1 (0xBB67AE85)
90
+ #define PHILOX_M4x32_0 (0xD2511F53)
91
+ #define PHILOX_M4x32_1 (0xCD9E8D57)
92
+
93
+ struct curandStatePhilox4_32_10 {
94
+ uint4 ctr;
95
+ uint4 output;
96
+ uint2 key;
97
+ unsigned int STATE;
98
+ int boxmuller_flag;
99
+ int boxmuller_flag_double;
100
+ float boxmuller_extra;
101
+ double boxmuller_extra_double;
102
+ };
103
+
104
+ typedef struct curandStatePhilox4_32_10 curandStatePhilox4_32_10_t;
105
+
106
+
107
+ QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s, unsigned long long n)
108
+ {
109
+ unsigned int nlo = (unsigned int)(n);
110
+ unsigned int nhi = (unsigned int)(n>>32);
111
+
112
+ s->ctr.x += nlo;
113
+ if( s->ctr.x < nlo )
114
+ nhi++;
115
+
116
+ s->ctr.y += nhi;
117
+ if(nhi <= s->ctr.y)
118
+ return;
119
+ if(++s->ctr.z) return;
120
+ ++s->ctr.w;
121
+ }
122
+
123
+ QUALIFIERS void Philox_State_Incr_hi(curandStatePhilox4_32_10_t* s, unsigned long long n)
124
+ {
125
+ unsigned int nlo = (unsigned int)(n);
126
+ unsigned int nhi = (unsigned int)(n>>32);
127
+
128
+ s->ctr.z += nlo;
129
+ if( s->ctr.z < nlo )
130
+ nhi++;
131
+
132
+ s->ctr.w += nhi;
133
+ }
134
+
135
+
136
+
137
+ QUALIFIERS void Philox_State_Incr(curandStatePhilox4_32_10_t* s)
138
+ {
139
+ if(++s->ctr.x) return;
140
+ if(++s->ctr.y) return;
141
+ if(++s->ctr.z) return;
142
+ ++s->ctr.w;
143
+ }
144
+
145
+
146
+ QUALIFIERS unsigned int mulhilo32(unsigned int a, unsigned int b, unsigned int* hip)
147
+ {
148
+ NV_IF_ELSE_TARGET(NV_IS_HOST,
149
+ // host code
150
+ unsigned long long product = ((unsigned long long)a) * ((unsigned long long)b);
151
+ *hip = product >> 32;
152
+ return (unsigned int)product;
153
+ ,
154
+ // device code
155
+ *hip = __umulhi(a,b);
156
+ return a*b;
157
+ )
158
+ }
159
+
160
+ QUALIFIERS uint4 _philox4x32round(uint4 ctr, uint2 key)
161
+ {
162
+ unsigned int hi0;
163
+ unsigned int hi1;
164
+ unsigned int lo0 = mulhilo32(PHILOX_M4x32_0, ctr.x, &hi0);
165
+ unsigned int lo1 = mulhilo32(PHILOX_M4x32_1, ctr.z, &hi1);
166
+
167
+ uint4 ret = {hi1^ctr.y^key.x, lo1, hi0^ctr.w^key.y, lo0};
168
+ return ret;
169
+ }
170
+
171
+ QUALIFIERS uint4 curand_Philox4x32_10( uint4 c, uint2 k)
172
+ {
173
+ c = _philox4x32round(c, k); // 1
174
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
175
+ c = _philox4x32round(c, k); // 2
176
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
177
+ c = _philox4x32round(c, k); // 3
178
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
179
+ c = _philox4x32round(c, k); // 4
180
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
181
+ c = _philox4x32round(c, k); // 5
182
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
183
+ c = _philox4x32round(c, k); // 6
184
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
185
+ c = _philox4x32round(c, k); // 7
186
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
187
+ c = _philox4x32round(c, k); // 8
188
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
189
+ c = _philox4x32round(c, k); // 9
190
+ k.x += PHILOX_W32_0; k.y += PHILOX_W32_1;
191
+ return _philox4x32round(c, k); // 10
192
+ }
193
+
194
+
195
+ #endif
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_poisson.h ADDED
@@ -0,0 +1,763 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_POISSON_H_)
52
+ #define CURAND_POISSON_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include <nv/target>
65
+
66
+ #include "curand_mrg32k3a.h"
67
+ #include "curand_mtgp32_kernel.h"
68
+ #include "curand_philox4x32_x.h"
69
+
70
+ #define CR_CUDART_PI 3.1415926535897931e+0
71
+ #define CR_CUDART_TWO_TO_52 4503599627370496.0
72
+
73
+
74
QUALIFIERS float __cr_rsqrt(float a)
{
    // Reciprocal square root: fast PTX approximation (flush-to-zero) in
    // device code, 1/sqrtf fallback on the host.
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        asm ("rsqrt.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
    ,
        a = 1.0f / sqrtf (a);
    )
    return a;
}
83
+
84
QUALIFIERS float __cr_exp (float a)
{
    // exp(a) via the device ex2 (2^x) approximation: exp(a) = 2^(a*log2(e)).
    // Host build uses expf directly.
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        a = a * 1.4426950408889634074;   // log2(e)
        asm ("ex2.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
    ,
        a = expf (a);
    )
    return a;
}
94
+
95
QUALIFIERS float __cr_log (float a)
{
    // ln(a) via the device lg2 (log2) approximation: ln(a) = log2(a)*ln(2).
    // Host build uses logf directly.
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        asm ("lg2.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
        a = a * 0.69314718055994530942;  // ln(2)
    ,
        a = logf (a);
    )
    return a;
}
105
+
106
QUALIFIERS float __cr_rcp (float a)
{
    // Fast reciprocal: PTX approximation on device, plain division on host.
    NV_IF_ELSE_TARGET(NV_IS_DEVICE,
        asm ("rcp.approx.f32.ftz %0, %1;" : "=f"(a) : "f"(a));
    ,
        a = 1.0f / a;
    )
    return a;
}
115
+
116
/* Approximates the regularized incomplete gamma function
 * P(a,x) = gammainc(a,x)/gamma(a) with a fitted squared-logistic form.
 * Accuracy is tuned for the Poisson rejection sampler below, not for
 * general-purpose use. */
QUALIFIERS float __cr_pgammainc (float a, float x)
{
    /* First level parametrization constants */
    float ma1 = 1.43248035075540910f,
          ma2 = 0.12400979329415655f,
          ma3 = 0.00025361074907033f,
          mb1 = 0.21096734870196546f,
          mb2 = 1.97381164089999420f,
          mb3 = 0.94201734077887530f;

    /* Second level parametrization constants (depend only on a) */
    float alpha = ma1 * __cr_rsqrt (a - ma2) + ma3;
    float beta  = mb1 * __cr_rsqrt (a - mb2) + mb3;

    /* Final approximation (depends on a and x): 1 / (1 + e^(alpha(a-x)-beta))^2 */
    float r = a - x;
    r = alpha * r - beta;
    r = 1.0f + __cr_exp (r);
    r = r * r;
    r = __cr_rcp (r);

    /* NOTE(review): negative or NaN a/x would need special handling; the
     * original left this disabled:
     * r = !(x > 0 && a >= 0) ? 0.0 : r; */
    return r;
}
149
+
150
/* Approximate inverse of __cr_pgammainc in x for fixed a (same fitted
 * squared-logistic parametrization, solved for x). */
QUALIFIERS float __cr_pgammaincinv (float a, float y)
{
    /* First level parametrization constants */
    float ma1 = 1.43248035075540910f,
          ma2 = 0.12400979329415655f,
          ma3 = 0.00025361074907033f,
          mb1 = 0.21096734870196546f,
          mb2 = 1.97381164089999420f,
          mb3 = 0.94201734077887530f;

    /* Second level parametrization constants (depend only on a) */
    float alpha = ma1 * __cr_rsqrt (a - ma2) + ma3;
    float beta  = mb1 * __cr_rsqrt (a - mb2) + mb3;

    /* Final approximation (depends on a and y): invert the logistic */
    float r = __cr_rsqrt (y) - 1.0f;
    r = __cr_log (r);
    r = beta + r;
    r = - r * __cr_rcp (alpha) + a;
    /* NOTE(review): negative or NaN a/y would need special handling; the
     * original left this disabled:
     * r = !(y > 0 && a >= 0) ? 0.0 : r; */
    return r;
}
181
+
182
/* lgamma(n) for n = 1..9, indexed [n-1]; used by __cr_lgamma_integer for
 * small arguments on device builds. */
#if defined(__CUDACC_RDC__) && (__cplusplus >= 201703L) && defined(__cpp_inline_variables)
/* Relocatable device code: a C++17 inline variable avoids duplicate
 * per-translation-unit copies. */
inline __constant__ double __cr_lgamma_table [] = {
#else
static __constant__ double __cr_lgamma_table [] = {
#endif
    0.000000000000000000e-1,  /* lgamma(1) */
    0.000000000000000000e-1,  /* lgamma(2) */
    6.931471805599453094e-1,  /* lgamma(3) */
    1.791759469228055001e0,   /* lgamma(4) */
    3.178053830347945620e0,   /* lgamma(5) */
    4.787491742782045994e0,   /* lgamma(6) */
    6.579251212010100995e0,   /* lgamma(7) */
    8.525161361065414300e0,   /* lgamma(8) */
    1.060460290274525023e1    /* lgamma(9) */
};
197
+
198
+
199
QUALIFIERS double __cr_lgamma_integer(int a)
{
    // log(gamma(a)) for integer a: table/switch lookup for a <= 8,
    // Stirling series otherwise.
    double fa = fabs((float)a);

    if (a > 8) {
        /* Stirling approximation; coefficients from Hart et al, "Computer
         * Approximations", Wiley 1968. Approximation 5404.
         */
        double s = 1.0 / fa;
        double t = s * s;
        double sum = -0.1633436431e-2;
        sum = sum * t + 0.83645878922e-3;
        sum = sum * t - 0.5951896861197e-3;
        sum = sum * t + 0.793650576493454e-3;
        sum = sum * t - 0.277777777735865004e-2;
        sum = sum * t + 0.833333333333331018375e-1;
        sum = sum * s + 0.918938533204672;
        s = 0.5 * log (fa);
        t = fa - 0.5;
        s = s * t;
        t = s - fa;
        s = s + sum;
        t = t + s;
        return t;
    } else {
        // Device builds read __constant__ memory; host builds cannot, so
        // they use an equivalent switch over the same values.
        NV_IF_ELSE_TARGET(NV_IS_DEVICE,
            return __cr_lgamma_table [(int) fa-1];
        ,
            switch(a) {
                case 1: return 0.000000000000000000e-1;
                case 2: return 0.000000000000000000e-1;
                case 3: return 6.931471805599453094e-1;
                case 4: return 1.791759469228055001e0;
                case 5: return 3.178053830347945620e0;
                case 6: return 4.787491742782045994e0;
                case 7: return 6.579251212010100995e0;
                case 8: return 8.525161361065414300e0;
                default: return 1.060460290274525023e1;
            }
        )
    }
}
244
+
245
+ #define KNUTH_FLOAT_CONST 60.0
246
template <typename T>
// Knuth's multiplicative Poisson sampler (TAOCP vol. 2): multiply uniform
// draws until the running product drops below exp(-lambda).  Starting the
// product at exp(lambda) and comparing against 1.0 is the same test without
// recomputing the threshold each iteration.
QUALIFIERS unsigned int curand_poisson_knuth(T *state, float lambda)
{
    float product = expf(lambda);
    unsigned int draws = 0;
    do {
        draws++;
        product *= curand_uniform(state);
    } while (product > 1.0);
    return draws - 1;
}
258
+
259
template <typename T>
// Knuth's multiplicative Poisson sampler (TAOCP vol. 2), four independent
// variates.  Each lane consumes its own run of uniforms from *state in the
// same order as the previous hand-unrolled implementation, so the returned
// values are unchanged; the duplication is simply folded into the scalar
// sampler.
QUALIFIERS uint4 curand_poisson_knuth4(T *state, float lambda)
{
    uint4 k;
    k.x = curand_poisson_knuth(state, lambda);
    k.y = curand_poisson_knuth(state, lambda);
    k.z = curand_poisson_knuth(state, lambda);
    k.w = curand_poisson_knuth(state, lambda);
    return k;
}
289
+
290
template <typename T>
// Marsaglia, Tsang & Wang (J. Stat. Software) square-histogram lookup:
// maps one raw draw x to a discrete variate via precomputed V/K tables.
QUALIFIERS unsigned int _curand_M2_double(T x, curandDistributionM2Shift_t distributionM2)
{
    double u = _curand_uniform_double(x);
    int j = (int) floor(distributionM2->length*u);

    // Fetch the cell; __ldg routes the read through the read-only cache on
    // sm_35+ devices.
    double cellV;
    unsigned int cellK;
    NV_IF_ELSE_TARGET(NV_PROVIDES_SM_35,
        cellV = __ldg( &(distributionM2->histogram->V[j]));
        cellK = __ldg( &(distributionM2->histogram->K[j]));
    ,
        cellV = distributionM2->histogram->V[j];
        cellK = distributionM2->histogram->K[j];
    )
    // Inside cell j's own mass return j, otherwise its alias K[j].
    if (u < cellV) return distributionM2->shift + j;
    return distributionM2->shift + cellK;
}
311
+
312
template <typename T>
// Marsaglia, Tsang & Wang square-histogram lookup applied independently to
// the four lanes of x (e.g. a uint4 from curand4).  Each lane performs
// exactly the scalar _curand_M2_double computation, so this replaces the
// unrolled flag-tracking version without changing any result; only the
// fourfold duplication is removed.
QUALIFIERS uint4 _curand_M2_double4(T x, curandDistributionM2Shift_t distributionM2)
{
    uint4 result;
    result.x = _curand_M2_double(x.x, distributionM2);
    result.y = _curand_M2_double(x.y, distributionM2);
    result.z = _curand_M2_double(x.z, distributionM2);
    result.w = _curand_M2_double(x.w, distributionM2);
    return result;
}
375
+
376
template <typename STATE>
// Thin wrapper: advance the generator once and feed the raw draw to the
// square-histogram mapper.
QUALIFIERS unsigned int curand_M2_double(STATE *state, curandDistributionM2Shift_t distributionM2)
{
    return _curand_M2_double(curand(state), distributionM2);
}
381
+
382
template <typename STATE>
// Thin wrapper: one four-lane draw, then the four-lane square-histogram
// mapper.
QUALIFIERS uint4 curand_M2_double4(STATE *state, curandDistributionM2Shift_t distributionM2)
{
    return _curand_M2_double4(curand4(state), distributionM2);
}
387
+
388
+
389
template <typename T>
// Inverse-CDF sampling by binary search over the cumulative probability
// table: returns shift + index of the first entry whose cdf >= u.
QUALIFIERS unsigned int _curand_binary_search_double(T x, curandDistributionShift_t distribution)
{
    double u = _curand_uniform_double(x);
    int lo = 0;
    int hi = distribution->length-1;
    do {
        int mid = (hi + lo)/2;
        double cdf_mid;
        // __ldg uses the read-only cache on sm_35+ devices.
        NV_IF_ELSE_TARGET(NV_PROVIDES_SM_35,
            cdf_mid = __ldg( &(distribution->probability[mid]));
        ,
            cdf_mid = distribution->probability[mid];
        )
        if (u <= cdf_mid) {
            hi = mid;
        } else {
            lo = mid+1;
        }
    } while (lo < hi);
    return distribution->shift + lo;
}
411
+
412
template <typename STATE>
// Thin wrapper: one raw draw, then the binary-search inverse-CDF lookup.
QUALIFIERS unsigned int curand_binary_search_double(STATE *state, curandDistributionShift_t distribution)
{
    return _curand_binary_search_double(curand(state), distribution);
}
417
+
418
// Maps a raw 32-bit draw to a uniform double strictly inside (0.0, 1.0).
// The standard _curand_uniform_double can yield exactly 1.0, which would
// hang _curand_poisson_ITR_double's inner loop; the half-step offset keeps
// the maximum at 1 - 2^-33 and the minimum at 2^-33.
QUALIFIERS double _curand_uniform_double_excluding_one(unsigned int x)
{
    return x * CURAND_2POW32_INV_DOUBLE + (CURAND_2POW32_INV_DOUBLE/2.0);
}
425
+
426
// Overload for 64-bit raw draws: keep the top 53 bits and offset by a
// quarter step so the result stays strictly inside (0.0, 1.0).
// Required only by _curand_poisson_ITR_double.
QUALIFIERS double _curand_uniform_double_excluding_one(unsigned long long x)
{
    return (x >> 11) * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/4.0);
}
432
+
433
+ #define MAGIC_DOUBLE_CONST 500.0
434
template <typename T>
// George S. Fishman, "Discrete-Event Simulation": inverse transform by
// sequential search.  To avoid double underflow for large lambda, the
// pmf/cdf accumulators are renormalized in exp(-MAGIC_DOUBLE_CONST) chunks.
QUALIFIERS unsigned int _curand_poisson_ITR_double(T x, double lambda)
{
    double rescale, pmf = 1.0;
    double cdf = 1.0;
    unsigned int k = 0;
    int chunk = 0;
    // The search needs u in (0;1): _curand_uniform_double returns a value in
    // (0;1], and u == 1.0 would make the inner loop run forever, so use the
    // one-excluding variant.
    double u = _curand_uniform_double_excluding_one(x);
    do {
        // Fold the next block of the exp(-lambda) normalization into both
        // accumulators.
        if (lambda > (double)(chunk+MAGIC_DOUBLE_CONST)) {
            rescale = exp(-MAGIC_DOUBLE_CONST);
        } else {
            rescale = exp((double)(chunk - lambda));
        }
        pmf *= rescale;
        cdf *= rescale;
        chunk += (int) MAGIC_DOUBLE_CONST;
        // Walk the cdf forward until it passes u.
        while (u > cdf) {
            k++;
            pmf *= ((double)lambda / (double) k);
            cdf += pmf;
        }
    } while ((double)chunk < lambda);
    return k;
}
463
+
464
template <typename T>
/* Rejection sampler for the Poisson distribution driven by the gammainc
 * approximation: propose from the approximate inverse CDF, then accept
 * against the exact (up to __cr_* precision) Poisson pmf. */
QUALIFIERS unsigned int curand_poisson_gammainc(T state, float lambda){
    float log_lambda = __cr_log (lambda);
    float sample;
    for (;;) {
        float u = curand_uniform (state);
        sample = floorf (__cr_pgammaincinv (lambda, u));
        float reject = curand_uniform (state);
        // Envelope: approximate cell probability, widened by 1.3.
        float band = (__cr_pgammainc (lambda, sample + 1.0f) - __cr_pgammainc (lambda, sample)) * 1.3f;
        reject = reject*band;
        // Exact pmf: exp(-lambda + k*log(lambda) - lgamma(k+1)).
        float target = (float)__cr_exp (-lambda + sample * log_lambda - (float)__cr_lgamma_integer ((int)(1.0f + sample)));
        if ((reject < target) && (band>=1e-20))
            break;
    }
    return (unsigned int)sample;
}
482
+
483
template <typename T>
/* Rejection sampler for the Poisson distribution based on the gammainc
 * approximation, producing four independent variates.  Each lane runs the
 * scalar rejection loop with the same per-iteration draw order as the
 * previous four-times-unrolled implementation, so results are unchanged;
 * the duplicated loop bodies are simply delegated to
 * curand_poisson_gammainc. */
QUALIFIERS uint4 curand_poisson_gammainc4(T state, float lambda){
    uint4 result;
    result.x = curand_poisson_gammainc(state, lambda);
    result.y = curand_poisson_gammainc(state, lambda);
    result.z = curand_poisson_gammainc(state, lambda);
    result.w = curand_poisson_gammainc(state, lambda);
    return result;
}
543
+ // Note below that the round to nearest integer, where needed,is done in line with code that
544
+ // assumes the range of values is < 2**32
545
+
546
template <typename T>
// Single-draw Poisson from one raw value: exact inverse-transform search
// for small lambda, normal approximation rounded to nearest above it.
// Assumes the result fits in 32 bits.
QUALIFIERS unsigned int _curand_poisson(T x, double lambda)
{
    if (lambda < 1000)
        return _curand_poisson_ITR_double(x, lambda);
    return (unsigned int)((sqrt(lambda) * _curand_normal_icdf_double(x)) + lambda + 0.5); //Round to nearest
}
553
+
554
template <typename T>
// Normal approximation to Poisson from a single raw draw (single-precision
// icdf), rounded to nearest.  Assumes the result fits in 32 bits.
QUALIFIERS unsigned int _curand_poisson_from_normal(T x, double lambda)
{
    return (unsigned int)((sqrt(lambda) * _curand_normal_icdf(x)) + lambda + 0.5); //Round to nearest
}
559
+
560
template <typename STATE>
// Normal approximation to Poisson drawn directly from a generator state,
// rounded to nearest.
QUALIFIERS unsigned int curand_poisson_from_normal(STATE state, double lambda)
{
    return (unsigned int)((sqrt(lambda) * curand_normal(state)) + lambda + 0.5); //Round to nearest
}
565
+
566
template <typename STATE>
// Four Poisson variates via the normal approximation, from a single
// curand_normal4 draw; each lane is rounded to nearest.
QUALIFIERS uint4 curand_poisson_from_normal4(STATE state, double lambda)
{
    uint4 result;
    float4 gauss = curand_normal4(state);
    result.x = (unsigned int)((sqrt(lambda) * gauss.x) + lambda + 0.5); //Round to nearest
    result.y = (unsigned int)((sqrt(lambda) * gauss.y) + lambda + 0.5); //Round to nearest
    result.z = (unsigned int)((sqrt(lambda) * gauss.z) + lambda + 0.5); //Round to nearest
    result.w = (unsigned int)((sqrt(lambda) * gauss.w) + lambda + 0.5); //Round to nearest
    return result;
}
580
+
581
+ /**
582
+ * \brief Return a Poisson-distributed unsigned int from a XORWOW generator.
583
+ *
584
+ * Return a single unsigned int from a Poisson
585
+ * distribution with lambda \p lambda from the XORWOW generator in \p state,
586
+ * increment the position of the generator by a variable amount, depending
587
+ * on the algorithm used.
588
+ *
589
+ * \param state - Pointer to state to update
590
+ * \param lambda - Lambda of the Poisson distribution
591
+ *
592
+ * \return Poisson-distributed unsigned int with lambda \p lambda
593
+ */
594
QUALIFIERS unsigned int curand_poisson(curandStateXORWOW_t *state, double lambda)
{
    // Select the sampler by lambda: Knuth's product method for small means,
    // the gammainc rejection sampler in the middle range, and the normal
    // approximation (rounded to nearest) for large means.
    if (lambda < 64)
        return curand_poisson_knuth(state, (float)lambda);
    if (lambda > 4000)
        return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
    return curand_poisson_gammainc(state, (float)lambda);
}
602
+
603
+ /**
604
+ * \brief Return a Poisson-distributed unsigned int from a Philox4_32_10 generator.
605
+ *
606
+ * Return a single unsigned int from a Poisson
607
+ * distribution with lambda \p lambda from the Philox4_32_10 generator in \p state,
608
+ * increment the position of the generator by a variable amount, depending
609
+ * on the algorithm used.
610
+ *
611
+ * \param state - Pointer to state to update
612
+ * \param lambda - Lambda of the Poisson distribution
613
+ *
614
+ * \return Poisson-distributed unsigned int with lambda \p lambda
615
+ */
616
QUALIFIERS unsigned int curand_poisson(curandStatePhilox4_32_10_t *state, double lambda)
{
    // Same lambda-based selection as the XORWOW overload: Knuth for small
    // means, gammainc rejection in the middle, normal approximation above.
    if (lambda < 64)
        return curand_poisson_knuth(state, (float)lambda);
    if (lambda > 4000)
        return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
    return curand_poisson_gammainc(state, (float)lambda);
}
624
+ /**
625
+ * \brief Return four Poisson-distributed unsigned ints from a Philox4_32_10 generator.
626
+ *
627
+ * Return four unsigned ints from a Poisson
628
+ * distribution with lambda \p lambda from the Philox4_32_10 generator in \p state,
629
+ * increment the position of the generator by a variable amount, depending
630
+ * on the algorithm used.
631
+ *
632
+ * \param state - Pointer to state to update
633
+ * \param lambda - Lambda of the Poisson distribution
634
+ *
635
+ * \return Poisson-distributed unsigned int with lambda \p lambda
636
+ */
637
QUALIFIERS uint4 curand_poisson4(curandStatePhilox4_32_10_t *state, double lambda)
{
    // Four variates at once, with the same lambda-based algorithm selection
    // as the scalar overloads.
    if (lambda < 64)
        return curand_poisson_knuth4(state, (float)lambda);
    if (lambda > 4000) {
        double4 gauss = curand_normal4_double(state);
        double sd = sqrt(lambda);  // invariant across lanes
        uint4 result;
        result.x = (unsigned int)((sd * gauss.x) + lambda + 0.5); //Round to nearest
        result.y = (unsigned int)((sd * gauss.y) + lambda + 0.5); //Round to nearest
        result.z = (unsigned int)((sd * gauss.z) + lambda + 0.5); //Round to nearest
        result.w = (unsigned int)((sd * gauss.w) + lambda + 0.5); //Round to nearest
        return result;
    }
    return curand_poisson_gammainc4(state, (float)lambda);
}
653
+
654
+
655
+
656
+ /**
657
+ * \brief Return a Poisson-distributed unsigned int from a MRG32k3A generator.
658
+ *
659
+ * Return a single unsigned int from a Poisson
660
+ * distribution with lambda \p lambda from the MRG32k3a generator in \p state,
661
+ * increment the position of the generator by a variable amount, depending
662
+ * on the algorithm used.
663
+ *
664
+ * \param state - Pointer to state to update
665
+ * \param lambda - Lambda of the Poisson distribution
666
+ *
667
+ * \return Poisson-distributed unsigned int with lambda \p lambda
668
+ */
669
QUALIFIERS unsigned int curand_poisson(curandStateMRG32k3a_t *state, double lambda)
{
    // Same lambda-based selection as the other pseudorandom overloads.
    if (lambda < 64)
        return curand_poisson_knuth(state, (float)lambda);
    if (lambda > 4000)
        return (unsigned int)((sqrt(lambda) * curand_normal_double(state)) + lambda + 0.5); //Round to nearest
    return curand_poisson_gammainc(state, (float)lambda);
}
677
+
678
+ /**
679
+ * \brief Return a Poisson-distributed unsigned int from a MTGP32 generator.
680
+ *
681
+ * Return a single unsigned int from a Poisson
682
+ * distribution with lambda \p lambda from the MTGP32 generator in \p state,
683
+ * increment the position of the generator by one.
684
+ *
685
+ * \param state - Pointer to state to update
686
+ * \param lambda - Lambda of the Poisson distribution
687
+ *
688
+ * \return Poisson-distributed unsigned int with lambda \p lambda
689
+ */
690
QUALIFIERS unsigned int curand_poisson(curandStateMtgp32_t *state, double lambda)
{
    // MTGP32 consumes exactly one raw draw; the shared single-draw helper
    // picks the algorithm from lambda.
    return _curand_poisson(curand(state), lambda);
}
694
+
695
+ /**
696
+ * \brief Return a Poisson-distributed unsigned int from a Sobol32 generator.
697
+ *
698
+ * Return a single unsigned int from a Poisson
699
+ * distribution with lambda \p lambda from the Sobol32 generator in \p state,
700
+ * increment the position of the generator by one.
701
+ *
702
+ * \param state - Pointer to state to update
703
+ * \param lambda - Lambda of the Poisson distribution
704
+ *
705
+ * \return Poisson-distributed unsigned int with lambda \p lambda
706
+ */
707
+
708
QUALIFIERS unsigned int curand_poisson(curandStateSobol32_t *state, double lambda)
{
    // Quasirandom generator: exactly one raw draw per variate, mapped by
    // the shared single-draw helper.
    return _curand_poisson(curand(state), lambda);
}
712
+
713
+ /**
714
+ * \brief Return a Poisson-distributed unsigned int from a scrambled Sobol32 generator.
715
+ *
716
+ * Return a single unsigned int from a Poisson
717
+ * distribution with lambda \p lambda from the scrambled Sobol32 generator in \p state,
718
+ * increment the position of the generator by one.
719
+ *
720
+ * \param state - Pointer to state to update
721
+ * \param lambda - Lambda of the Poisson distribution
722
+ *
723
+ * \return Poisson-distributed unsigned int with lambda \p lambda
724
+ */
725
QUALIFIERS unsigned int curand_poisson(curandStateScrambledSobol32_t *state, double lambda)
{
    // Scrambled Sobol32: one raw draw per variate, mapped by the shared
    // single-draw helper.
    return _curand_poisson(curand(state), lambda);
}
729
+
730
+ /**
731
+ * \brief Return a Poisson-distributed unsigned int from a Sobol64 generator.
732
+ *
733
+ * Return a single unsigned int from a Poisson
734
+ * distribution with lambda \p lambda from the Sobol64 generator in \p state,
735
+ * increment position of generator by one.
736
+ *
737
+ * \param state - Pointer to state to update
738
+ * \param lambda - Lambda of the Poisson distribution
739
+ *
740
+ * \return Poisson-distributed unsigned int with lambda \p lambda
741
+ */
742
QUALIFIERS unsigned int curand_poisson(curandStateSobol64_t *state, double lambda)
{
    // Sobol64: one raw 64-bit draw per variate, mapped by the shared
    // single-draw helper.
    return _curand_poisson(curand(state), lambda);
}
746
+
747
+ /**
748
+ * \brief Return a Poisson-distributed unsigned int from a scrambled Sobol64 generator.
749
+ *
750
+ * Return a single unsigned int from a Poisson
751
+ * distribution with lambda \p lambda from the scrambled Sobol64 generator in \p state,
752
+ * increment position of generator by one.
753
+ *
754
+ * \param state - Pointer to state to update
755
+ * \param lambda - Lambda of the Poisson distribution
756
+ *
757
+ * \return Poisson-distributed unsigned int with lambda \p lambda
758
+ */
759
QUALIFIERS unsigned int curand_poisson(curandStateScrambledSobol64_t *state, double lambda)
{
    // Scrambled Sobol64: one raw 64-bit draw per variate, mapped by the
    // shared single-draw helper.
    return _curand_poisson(curand(state), lambda);
}
763
+ #endif // !defined(CURAND_POISSON_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_precalc.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/nvidia/curand/include/curand_uniform.h ADDED
@@ -0,0 +1,498 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ /* Copyright 2010-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * The source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * The Licensed Deliverables contained herein are PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+
51
+ #if !defined(CURAND_UNIFORM_H_)
52
+ #define CURAND_UNIFORM_H_
53
+
54
+ /**
55
+ * \defgroup DEVICE Device API
56
+ *
57
+ * @{
58
+ */
59
+
60
+ #ifndef __CUDACC_RTC__
61
+ #include <math.h>
62
+ #endif // __CUDACC_RTC__
63
+
64
+ #include "curand_mrg32k3a.h"
65
+ #include "curand_mtgp32_kernel.h"
66
+ #include "curand_philox4x32_x.h"
67
+
68
+
69
QUALIFIERS float _curand_uniform(unsigned int x)
{
    // Map a 32-bit draw into (0, 1]: scale by 2^-32 and offset by half a
    // step so x == 0 never yields 0.0f.
    return x * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
}
73
+
74
QUALIFIERS float4 _curand_uniform4(uint4 x)
{
    // Component-wise version of _curand_uniform for a uint4 draw.
    float4 y;
    y.x = _curand_uniform(x.x);
    y.y = _curand_uniform(x.y);
    y.z = _curand_uniform(x.z);
    y.w = _curand_uniform(x.w);
    return y;
}
83
+
84
QUALIFIERS float _curand_uniform(unsigned long long x)
{
    // A float carries far fewer than 64 bits of precision, so keep only the
    // top 32 bits of the draw and reuse the 32-bit mapping.
    unsigned int hi = (unsigned int)(x >> 32);
    return hi * CURAND_2POW32_INV + (CURAND_2POW32_INV/2.0f);
}
90
+
91
QUALIFIERS double _curand_uniform_double(unsigned int x)
{
    // Affine map of a 32-bit draw into (0, 1]: equals (x+1) * 2^-32, so
    // x == 2^32-1 yields exactly 1.0 and x == 0 yields 2^-32.
    // NOTE(review): unlike the float path this adds a full step rather than
    // a half step — presumably intentional so the range includes 1.0;
    // confirm before "fixing".
    return x * CURAND_2POW32_INV_DOUBLE + CURAND_2POW32_INV_DOUBLE;
}
95
+
96
+ QUALIFIERS double _curand_uniform_double(unsigned long long x)
97
+ {
98
+ return (x >> 11) * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
99
+ }
100
+
101
+ QUALIFIERS double _curand_uniform_double_hq(unsigned int x, unsigned int y)
102
+ {
103
+ unsigned long long z = (unsigned long long)x ^
104
+ ((unsigned long long)y << (53 - 32));
105
+ return z * CURAND_2POW53_INV_DOUBLE + (CURAND_2POW53_INV_DOUBLE/2.0);
106
+ }
107
+
108
+ QUALIFIERS float curand_uniform(curandStateTest_t *state)
109
+ {
110
+ return _curand_uniform(curand(state));
111
+ }
112
+
113
+ QUALIFIERS double curand_uniform_double(curandStateTest_t *state)
114
+ {
115
+ return _curand_uniform_double(curand(state));
116
+ }
117
+
118
+ /**
119
+ * \brief Return a uniformly distributed float from an XORWOW generator.
120
+ *
121
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
122
+ * from the XORWOW generator in \p state, increment position of generator.
123
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
124
+ * point outputs are never returned.
125
+ *
126
+ * The implementation may use any number of calls to \p curand() to
127
+ * get enough random bits to create the return value. The current
128
+ * implementation uses one call.
129
+ *
130
+ * \param state - Pointer to state to update
131
+ *
132
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
133
+ */
134
+ QUALIFIERS float curand_uniform(curandStateXORWOW_t *state)
135
+ {
136
+ return _curand_uniform(curand(state));
137
+ }
138
+
139
+ /**
140
+ * \brief Return a uniformly distributed double from an XORWOW generator.
141
+ *
142
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
143
+ * from the XORWOW generator in \p state, increment position of generator.
144
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
145
+ * point outputs are never returned.
146
+ *
147
+ * The implementation may use any number of calls to \p curand() to
148
+ * get enough random bits to create the return value. The current
149
+ * implementation uses exactly two calls.
150
+ *
151
+ * \param state - Pointer to state to update
152
+ *
153
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
154
+ */
155
+ QUALIFIERS double curand_uniform_double(curandStateXORWOW_t *state)
156
+ {
157
+ unsigned int x, y;
158
+ x = curand(state);
159
+ y = curand(state);
160
+ return _curand_uniform_double_hq(x, y);
161
+ }
162
+ /**
163
+ * \brief Return a uniformly distributed float from an MRG32k3a generator.
164
+ *
165
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
166
+ * from the MRG32k3a generator in \p state, increment position of generator.
167
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
168
+ * point outputs are never returned.
169
+ *
170
+ * The implementation returns up to 23 bits of mantissa, with the minimum
171
+ * return value \f$ 2^{-32} \f$
172
+ *
173
+ * \param state - Pointer to state to update
174
+ *
175
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
176
+ */
177
+ QUALIFIERS float curand_uniform(curandStateMRG32k3a_t *state)
178
+ {
179
+ return ((float)(curand_MRG32k3a(state)*MRG32K3A_NORM));
180
+ }
181
+
182
+ /**
183
+ * \brief Return a uniformly distributed double from an MRG32k3a generator.
184
+ *
185
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
186
+ * from the MRG32k3a generator in \p state, increment position of generator.
187
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
188
+ * point outputs are never returned.
189
+ *
190
+ * Note the implementation returns at most 32 random bits of mantissa as
191
+ * outlined in the seminal paper by L'Ecuyer.
192
+ *
193
+ * \param state - Pointer to state to update
194
+ *
195
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
196
+ */
197
+ QUALIFIERS double curand_uniform_double(curandStateMRG32k3a_t *state)
198
+ {
199
+ return curand_MRG32k3a(state)*MRG32K3A_NORM;
200
+ }
201
+
202
+
203
+
204
+ /**
205
+ * \brief Return a uniformly distributed tuple of 2 doubles from an Philox4_32_10 generator.
206
+ *
207
+ * Return a uniformly distributed 2 doubles (double4) between \p 0.0 and \p 1.0
208
+ * from the Philox4_32_10 generator in \p state, increment position of generator by 4.
209
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
210
+ * point outputs are never returned.
211
+ *
212
+ * \param state - Pointer to state to update
213
+ *
214
+ * \return 2 uniformly distributed doubles between \p 0.0 and \p 1.0
215
+ */
216
+
217
+ QUALIFIERS double2 curand_uniform2_double(curandStatePhilox4_32_10_t *state)
218
+ {
219
+ uint4 _x;
220
+ double2 result;
221
+ _x = curand4(state);
222
+ result.x = _curand_uniform_double_hq(_x.x,_x.y);
223
+ result.y = _curand_uniform_double_hq(_x.z,_x.w);
224
+ return result;
225
+ }
226
+
227
+
228
+ // not a part of API
229
+ QUALIFIERS double4 curand_uniform4_double(curandStatePhilox4_32_10_t *state)
230
+ {
231
+ uint4 _x, _y;
232
+ double4 result;
233
+ _x = curand4(state);
234
+ _y = curand4(state);
235
+ result.x = _curand_uniform_double_hq(_x.x,_x.y);
236
+ result.y = _curand_uniform_double_hq(_x.z,_x.w);
237
+ result.z = _curand_uniform_double_hq(_y.x,_y.y);
238
+ result.w = _curand_uniform_double_hq(_y.z,_y.w);
239
+ return result;
240
+ }
241
+
242
+ /**
243
+ * \brief Return a uniformly distributed float from a Philox4_32_10 generator.
244
+ *
245
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
246
+ * from the Philox4_32_10 generator in \p state, increment position of generator.
247
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
248
+ * point outputs are never returned.
249
+ *
250
+ * \param state - Pointer to state to update
251
+ *
252
+ * \return uniformly distributed float between \p 0.0 and \p 1.0
253
+ *
254
+ */
255
+ QUALIFIERS float curand_uniform(curandStatePhilox4_32_10_t *state)
256
+ {
257
+ return _curand_uniform(curand(state));
258
+ }
259
+
260
+ /**
261
+ * \brief Return a uniformly distributed tuple of 4 floats from a Philox4_32_10 generator.
262
+ *
263
+ * Return a uniformly distributed 4 floats between \p 0.0f and \p 1.0f
264
+ * from the Philox4_32_10 generator in \p state, increment position of generator by 4.
265
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
266
+ * point outputs are never returned.
267
+ *
268
+ * \param state - Pointer to state to update
269
+ *
270
+ * \return uniformly distributed float between \p 0.0 and \p 1.0
271
+ *
272
+ */
273
+ QUALIFIERS float4 curand_uniform4(curandStatePhilox4_32_10_t *state)
274
+ {
275
+ return _curand_uniform4(curand4(state));
276
+ }
277
+
278
+ /**
279
+ * \brief Return a uniformly distributed float from a MTGP32 generator.
280
+ *
281
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
282
+ * from the MTGP32 generator in \p state, increment position of generator.
283
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
284
+ * point outputs are never returned.
285
+ *
286
+ * \param state - Pointer to state to update
287
+ *
288
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
289
+ */
290
+ QUALIFIERS float curand_uniform(curandStateMtgp32_t *state)
291
+ {
292
+ return _curand_uniform(curand(state));
293
+ }
294
+ /**
295
+ * \brief Return a uniformly distributed double from a MTGP32 generator.
296
+ *
297
+ * Return a uniformly distributed double between \p 0.0f and \p 1.0f
298
+ * from the MTGP32 generator in \p state, increment position of generator.
299
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
300
+ * point outputs are never returned.
301
+ *
302
+ * Note that the implementation uses only 32 random bits to generate a single double
303
+ * precision value.
304
+ *
305
+ * \param state - Pointer to state to update
306
+ *
307
+ * \return uniformly distributed double between \p 0.0f and \p 1.0f
308
+ */
309
+ QUALIFIERS double curand_uniform_double(curandStateMtgp32_t *state)
310
+ {
311
+ return _curand_uniform_double(curand(state));
312
+ }
313
+
314
+ /**
315
+ * \brief Return a uniformly distributed double from a Philox4_32_10 generator.
316
+ *
317
+ * Return a uniformly distributed double between \p 0.0f and \p 1.0f
318
+ * from the Philox4_32_10 generator in \p state, increment position of generator.
319
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
320
+ * point outputs are never returned.
321
+ *
322
+ * Note that the implementation uses only 32 random bits to generate a single double
323
+ * precision value.
324
+ *
325
+ * \p curand_uniform2_double() is recommended for higher quality uniformly distributed
326
+ * double precision values.
327
+ *
328
+ * \param state - Pointer to state to update
329
+ *
330
+ * \return uniformly distributed double between \p 0.0f and \p 1.0f
331
+ */
332
+
333
+ QUALIFIERS double curand_uniform_double(curandStatePhilox4_32_10_t *state)
334
+ {
335
+ return _curand_uniform_double(curand(state));
336
+ }
337
+
338
+
339
+ /**
340
+ * \brief Return a uniformly distributed float from a Sobol32 generator.
341
+ *
342
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
343
+ * from the Sobol32 generator in \p state, increment position of generator.
344
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
345
+ * point outputs are never returned.
346
+ *
347
+ * The implementation is guaranteed to use a single call to \p curand().
348
+ *
349
+ * \param state - Pointer to state to update
350
+ *
351
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
352
+ */
353
+ QUALIFIERS float curand_uniform(curandStateSobol32_t *state)
354
+ {
355
+ return _curand_uniform(curand(state));
356
+ }
357
+
358
+ /**
359
+ * \brief Return a uniformly distributed double from a Sobol32 generator.
360
+ *
361
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
362
+ * from the Sobol32 generator in \p state, increment position of generator.
363
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
364
+ * point outputs are never returned.
365
+ *
366
+ * The implementation is guaranteed to use a single call to \p curand()
367
+ * to preserve the quasirandom properties of the sequence.
368
+ *
369
+ * Note that the implementation uses only 32 random bits to generate a single double
370
+ * precision value.
371
+ *
372
+ * \param state - Pointer to state to update
373
+ *
374
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
375
+ */
376
+ QUALIFIERS double curand_uniform_double(curandStateSobol32_t *state)
377
+ {
378
+ return _curand_uniform_double(curand(state));
379
+ }
380
+ /**
381
+ * \brief Return a uniformly distributed float from a scrambled Sobol32 generator.
382
+ *
383
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
384
+ * from the scrambled Sobol32 generator in \p state, increment position of generator.
385
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
386
+ * point outputs are never returned.
387
+ *
388
+ * The implementation is guaranteed to use a single call to \p curand().
389
+ *
390
+ * \param state - Pointer to state to update
391
+ *
392
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
393
+ */
394
+ QUALIFIERS float curand_uniform(curandStateScrambledSobol32_t *state)
395
+ {
396
+ return _curand_uniform(curand(state));
397
+ }
398
+
399
+ /**
400
+ * \brief Return a uniformly distributed double from a scrambled Sobol32 generator.
401
+ *
402
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
403
+ * from the scrambled Sobol32 generator in \p state, increment position of generator.
404
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
405
+ * point outputs are never returned.
406
+ *
407
+ * The implementation is guaranteed to use a single call to \p curand()
408
+ * to preserve the quasirandom properties of the sequence.
409
+ *
410
+ * Note that the implementation uses only 32 random bits to generate a single double
411
+ * precision value.
412
+ *
413
+ * \param state - Pointer to state to update
414
+ *
415
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
416
+ */
417
+ QUALIFIERS double curand_uniform_double(curandStateScrambledSobol32_t *state)
418
+ {
419
+ return _curand_uniform_double(curand(state));
420
+ }
421
+ /**
422
+ * \brief Return a uniformly distributed float from a Sobol64 generator.
423
+ *
424
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
425
+ * from the Sobol64 generator in \p state, increment position of generator.
426
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
427
+ * point outputs are never returned.
428
+ *
429
+ * The implementation is guaranteed to use a single call to \p curand().
430
+ *
431
+ * \param state - Pointer to state to update
432
+ *
433
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
434
+ */
435
+ QUALIFIERS float curand_uniform(curandStateSobol64_t *state)
436
+ {
437
+ return _curand_uniform(curand(state));
438
+ }
439
+
440
+ /**
441
+ * \brief Return a uniformly distributed double from a Sobol64 generator.
442
+ *
443
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
444
+ * from the Sobol64 generator in \p state, increment position of generator.
445
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
446
+ * point outputs are never returned.
447
+ *
448
+ * The implementation is guaranteed to use a single call to \p curand()
449
+ * to preserve the quasirandom properties of the sequence.
450
+ *
451
+ * \param state - Pointer to state to update
452
+ *
453
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
454
+ */
455
+ QUALIFIERS double curand_uniform_double(curandStateSobol64_t *state)
456
+ {
457
+ return _curand_uniform_double(curand(state));
458
+ }
459
+ /**
460
+ * \brief Return a uniformly distributed float from a scrambled Sobol64 generator.
461
+ *
462
+ * Return a uniformly distributed float between \p 0.0f and \p 1.0f
463
+ * from the scrambled Sobol64 generator in \p state, increment position of generator.
464
+ * Output range excludes \p 0.0f but includes \p 1.0f. Denormalized floating
465
+ * point outputs are never returned.
466
+ *
467
+ * The implementation is guaranteed to use a single call to \p curand().
468
+ *
469
+ * \param state - Pointer to state to update
470
+ *
471
+ * \return uniformly distributed float between \p 0.0f and \p 1.0f
472
+ */
473
+ QUALIFIERS float curand_uniform(curandStateScrambledSobol64_t *state)
474
+ {
475
+ return _curand_uniform(curand(state));
476
+ }
477
+
478
+ /**
479
+ * \brief Return a uniformly distributed double from a scrambled Sobol64 generator.
480
+ *
481
+ * Return a uniformly distributed double between \p 0.0 and \p 1.0
482
+ * from the scrambled Sobol64 generator in \p state, increment position of generator.
483
+ * Output range excludes \p 0.0 but includes \p 1.0. Denormalized floating
484
+ * point outputs are never returned.
485
+ *
486
+ * The implementation is guaranteed to use a single call to \p curand()
487
+ * to preserve the quasirandom properties of the sequence.
488
+ *
489
+ * \param state - Pointer to state to update
490
+ *
491
+ * \return uniformly distributed double between \p 0.0 and \p 1.0
492
+ */
493
+ QUALIFIERS double curand_uniform_double(curandStateScrambledSobol64_t *state)
494
+ {
495
+ return _curand_uniform_double(curand(state));
496
+ }
497
+
498
+ #endif // !defined(CURAND_UNIFORM_H_)
.venv/lib/python3.11/site-packages/nvidia/curand/lib/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/curand/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (190 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/nvtx/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/nvtx/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (184 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (192 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExt.h ADDED
@@ -0,0 +1,1561 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ /** \file nvToolsExt.h
39
+ */
40
+
41
+ /* ========================================================================= */
42
+ /** \mainpage
43
+ * \tableofcontents
44
+ * \section INTRODUCTION Introduction
45
+ *
46
+ * The NVIDIA Tools Extension library is a set of functions that a
47
+ * developer can use to provide additional information to tools.
48
+ * The additional information is used by the tool to improve
49
+ * analysis and visualization of data.
50
+ *
51
+ * The library introduces close to zero overhead if no tool is
52
+ * attached to the application. The overhead when a tool is
53
+ * attached is specific to the tool.
54
+ *
55
+ * \section INITIALIZATION_SECTION Initialization
56
+ *
57
+ * Typically the tool's library that plugs into NVTX is indirectly
58
+ * loaded via enviromental properties that are platform specific.
59
+ * For some platform or special cases, the user may be required
60
+ * to instead explicity initialize instead though. This can also
61
+ * be helpful to control when the API loads a tool's library instead
62
+ * of what would typically be the first function call to emit info.
63
+ * For these rare case, see \ref INITIALIZATION for additional information.
64
+ *
65
+ * \section MARKERS_AND_RANGES Markers and Ranges
66
+ *
67
+ * Markers and ranges are used to describe events at a specific time (markers)
68
+ * or over a time span (ranges) during the execution of the application
69
+ * respectively.
70
+ *
71
+ * \subsection MARKERS Markers
72
+ *
73
+ * Markers denote specific moments in time.
74
+ *
75
+ *
76
+ * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
77
+ * how to specify the domain.
78
+ *
79
+ * \subsection THREAD_RANGES Thread Ranges
80
+ *
81
+ * Thread ranges denote nested time ranges. Nesting is maintained per thread
82
+ * per domain and does not require any additional correlation mechanism. The
83
+ * duration of a thread range is defined by the corresponding pair of
84
+ * nvtxRangePush* to nvtxRangePop API calls.
85
+ *
86
+ * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
87
+ * how to specify the domain.
88
+ *
89
+ * \subsection PROCESS_RANGES Process Ranges
90
+ *
91
+ * Process ranges denote a time span that can expose arbitrary concurrency, as
92
+ * opposed to thread ranges that only support nesting. In addition the range
93
+ * start event can happen on a different thread than the end marker. For the
94
+ * correlation of a start/end pair an unique correlation ID is used that is
95
+ * returned from the start API call and needs to be passed into the end API
96
+ * call.
97
+ *
98
+ * \subsection EVENT_ATTRIBUTES Event Attributes
99
+ *
100
+ * \ref MARKERS_AND_RANGES can be annotated with various attributes to provide
101
+ * additional information for an event or to guide the tool's visualization of
102
+ * the data. Each of the attributes is optional and if left unused the
103
+ * attributes fall back to a default value. The attributes include:
104
+ * - color
105
+ * - category
106
+ *
107
+ * To specify any attribute other than the text message, the \ref
108
+ * EVENT_ATTRIBUTE_STRUCTURE "Event Attribute Structure" must be used.
109
+ *
110
+ * \section DOMAINS Domains
111
+ *
112
+ * Domains enable developers to scope annotations. By default all events and
113
+ * annotations are in the default domain. Additional domains can be registered.
114
+ * This allows developers to scope markers, ranges, and resources names to
115
+ * avoid conflicts.
116
+ *
117
+ * The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create
118
+ * a named domain.
119
+ *
120
+ * Each domain maintains its own
121
+ * - categories
122
+ * - thread range stacks
123
+ * - registered strings
124
+ *
125
+ * The function ::nvtxDomainDestroy marks the end of the domain. Destroying
126
+ * a domain unregisters and destroys all objects associated with it such as
127
+ * registered strings, resource objects, named categories, and started ranges.
128
+ *
129
+ * \section RESOURCE_NAMING Resource Naming
130
+ *
131
+ * This section covers calls that allow to annotate objects with user-provided
132
+ * names in order to allow for a better analysis of complex trace data. All of
133
+ * the functions take the handle or the ID of the object to name and the name.
134
+ * The functions can be called multiple times during the execution of an
135
+ * application, however, in that case it is implementation dependent which
136
+ * name will be reported by the tool.
137
+ *
138
+ * \subsection CATEGORY_NAMING Category Naming
139
+ *
140
+ * Some function in this library support associating an integer category
141
+ * to enable filtering and sorting. The category naming functions allow
142
+ * the application to associate a user friendly name with the integer
143
+ * category. Support for domains have been added in NVTX_VERSION_2 to
144
+ * avoid collisions when domains are developed independantly.
145
+ *
146
+ * \subsection RESOURCE_OBJECTS Resource Objects
147
+ *
148
+ * Resource objects are a generic mechanism for attaching data to an application
149
+ * resource. The identifier field makes the association to a pointer or handle,
150
+ * while the type field helps provide deeper understanding of the identifier as
151
+ * well as enabling differentiation in cases where handles generated by different
152
+ * APIs may collide. The resource object may also have an associated message to
153
+ * associate with the application resource, enabling further annotation of this
154
+ * object and how it is used.
155
+ *
156
+ * The resource object was introduced in NVTX_VERSION_2 to supersede existing naming
157
+ * functions and allow the application resource identified by those functions to be
158
+ * associated to a domain. The other naming functions are still supported for backward
159
+ * compatibility but will be associated only to the default domain.
160
+ *
161
+ * \subsection RESOURCE_NAMING_OS Resource Naming
162
+ *
163
+ * Some operating system resources creation APIs do not support providing a user friendly
164
+ * name, such as some OS thread creation APIs. This API support resource naming though
165
+ * both through resource objects and functions following the pattern
166
+ * nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2
167
+ * supersede the other functions with a a more general method of assigning names to OS resources,
168
+ * along with associating them to domains too. The older nvtxName* functions are only associated
169
+ * with the default domain.
170
+ * \section EXTENSIONS Optional Extensions
171
+ * Optional extensions will either appear within the existing sections the extend or appear
172
+ * in the "Related Pages" when they introduce new concepts.
173
+ */
174
+
175
+ #ifndef NVTOOLSEXT_H_
176
+ #define NVTOOLSEXT_H_
177
+
178
+ #if defined(_MSC_VER)
179
+ #ifdef NVTX_EXPORTS
180
+ #define NVTX_DECLSPEC
181
+ #else
182
+ #define NVTX_DECLSPEC __declspec(dllimport)
183
+ #endif /* NVTX_EXPORTS */
184
+ #define NVTX_API __stdcall
185
+ #define NVTX_INLINE_STATIC __inline static
186
+ #else /*defined(__GNUC__)*/
187
+ #define NVTX_DECLSPEC
188
+ #define NVTX_API
189
+ #define NVTX_INLINE_STATIC inline static
190
+ #endif /* Platform */
191
+
192
+ /**
193
+ * The nvToolsExt library depends on stdint.h. If the build tool chain in use
194
+ * does not include stdint.h then define NVTX_STDINT_TYPES_ALREADY_DEFINED
195
+ * and define the following types:
196
+ * <ul>
197
+ * <li>uint8_t
198
+ * <li>int8_t
199
+ * <li>uint16_t
200
+ * <li>int16_t
201
+ * <li>uint32_t
202
+ * <li>int32_t
203
+ * <li>uint64_t
204
+ * <li>int64_t
205
+ * <li>uintptr_t
206
+ * <li>intptr_t
207
+ * </ul>
208
+ #define NVTX_STDINT_TYPES_ALREADY_DEFINED if you are using your own header file.
209
+ */
210
+ #ifndef NVTX_STDINT_TYPES_ALREADY_DEFINED
211
+ #include <stdint.h>
212
+ #endif
213
+
214
+ #include <stddef.h>
215
+
216
+ #ifdef __cplusplus
217
+ extern "C" {
218
+ #endif /* __cplusplus */
219
+
220
+ /**
221
+ * Tools Extension API version
222
+ */
223
+ #define NVTX_VERSION 2
224
+
225
+ /**
226
+ * Size of the nvtxEventAttributes_t structure.
227
+ */
228
+ #define NVTX_EVENT_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxEventAttributes_t) ) )
229
+
230
+ /**
231
+ * Size of the nvtxInitializationAttributes_t structure.
232
+ */
233
+ #define NVTX_INITIALIZATION_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxInitializationAttributes_t) ) )
234
+
235
+ #define NVTX_NO_PUSH_POP_TRACKING ((int)-2)
236
+
237
+ typedef uint64_t nvtxRangeId_t;
238
+
239
+
240
+ /* \brief String Handle Structure.
241
+ * \anchor STRING_HANDLE_STRUCTURE
242
+ *
243
+ * This structure is opaque to the user and is used as a handle to reference
244
+ * a string. The tools will return a pointer through the API for the application
245
+ * to hold on it's behalf to reference the string in the future.
246
+ *
247
+ */
248
+ typedef struct nvtxStringHandle* nvtxStringHandle_t;
249
+
250
+ /* \brief Domain Handle Structure.
251
+ * \anchor DOMAIN_HANDLE_STRUCTURE
252
+ *
253
+ * This structure is opaque to the user and is used as a handle to reference
254
+ * a domain. The tools will return a pointer through the API for the application
255
+ * to hold on its behalf to reference the domain in the future.
256
+ *
257
+ */
258
+ typedef struct nvtxDomainHandle* nvtxDomainHandle_t;
259
+
260
+
261
+
262
+
263
+
264
+
265
+ /* ========================================================================= */
266
+ /** \defgroup GENERAL General
267
+ * @{
268
+ */
269
+
270
+ /** ---------------------------------------------------------------------------
271
+ * Color Types
272
+ * ------------------------------------------------------------------------- */
273
+ typedef enum nvtxColorType_t
274
+ {
275
+ NVTX_COLOR_UNKNOWN = 0, /**< Color attribute is unused. */
276
+ NVTX_COLOR_ARGB = 1 /**< An ARGB color is provided. */
277
+ } nvtxColorType_t;
278
+
279
/** ---------------------------------------------------------------------------
 * Message Types
 * ------------------------------------------------------------------------- */
typedef enum nvtxMessageType_t
{
    NVTX_MESSAGE_UNKNOWN         = 0, /**< Message payload is unused. */
    NVTX_MESSAGE_TYPE_ASCII      = 1, /**< A character sequence is used as payload. */
    NVTX_MESSAGE_TYPE_UNICODE    = 2, /**< A wide character sequence is used as payload. */
    /* NVTX_VERSION_2 */
    NVTX_MESSAGE_TYPE_REGISTERED = 3  /**< A unique string handle that was registered
                                           with \ref nvtxDomainRegisterStringA() or
                                           \ref nvtxDomainRegisterStringW(). */
} nvtxMessageType_t;
292
+
293
+ typedef union nvtxMessageValue_t
294
+ {
295
+ const char* ascii;
296
+ const wchar_t* unicode;
297
+ /* NVTX_VERSION_2 */
298
+ nvtxStringHandle_t registered;
299
+ } nvtxMessageValue_t;
300
+
301
+
302
+ /** @} */ /*END defgroup*/
303
+
304
/* ========================================================================= */
/** \defgroup INITIALIZATION Initialization
 * @{
 * Typically the tool library that plugs into NVTX is loaded indirectly
 * via platform-specific environment properties. On some platforms, or in
 * special cases, the user may instead be required to initialize explicitly.
 * Explicit initialization can also be used to control exactly when the API
 * loads a tool's library, rather than waiting for the first
 * information-emitting function call.
 */

/** ---------------------------------------------------------------------------
 * Initialization Modes
 * ------------------------------------------------------------------------- */
typedef enum nvtxInitializationMode_t
{
    NVTX_INITIALIZATION_MODE_UNKNOWN     = 0, /**< A platform that supports indirect initialization will attempt this style, otherwise expect failure. */
    NVTX_INITIALIZATION_MODE_CALLBACK_V1 = 1, /**< A function pointer conforming to NVTX_VERSION=1 will be used. */
    NVTX_INITIALIZATION_MODE_CALLBACK_V2 = 2, /**< A function pointer conforming to NVTX_VERSION=2 will be used. */
    NVTX_INITIALIZATION_MODE_SIZE             /**< Number of valid modes; not itself a usable mode. */
} nvtxInitializationMode_t;
325
+
326
+
327
/** \brief Initialization Attribute Structure.
 * \anchor INITIALIZATION_ATTRIBUTE_STRUCTURE
 *
 * Describes the attributes used for initialization of the NVTX API.
 *
 * \par Initializing the Attributes
 *
 * The caller should always perform the following three tasks when using
 * attributes:
 * <ul>
 * <li>Zero the structure
 * <li>Set the version field
 * <li>Set the size field
 * </ul>
 *
 * Zeroing the structure sets every attribute type and value to its
 * default. The version and size fields are used by the Tools Extension
 * implementation to handle multiple versions of this structure.
 * NVTX_INITIALIZATION_ATTRIB_STRUCT_SIZE may be used for the size.
 *
 * \par Method 1: Initializing nvtxInitializationAttributes_t for future compatibility
 * \code
 * nvtxInitializationAttributes_t initAttribs = {0};
 * initAttribs.version = NVTX_VERSION;
 * initAttribs.size = NVTX_INITIALIZATION_ATTRIB_STRUCT_SIZE;
 * \endcode
 *
 * \par Method 2: Initializing nvtxInitializationAttributes_t for a specific version
 * \code
 * nvtxInitializationAttributes_t initAttribs = {0};
 * initAttribs.version = 2;
 * initAttribs.size = (uint16_t)(sizeof(nvtxInitializationAttributes_v2));
 * \endcode
 *
 * With Method 1 it is critical that the entire binary layout of the
 * structure be set to 0 so that all fields start at their default values.
 *
 * The caller should either use both NVTX_VERSION and
 * NVTX_INITIALIZATION_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
 * with a versioned type (Method 2). Mixing the two methods is likely to
 * cause source-level or binary incompatibility in the future.
 *
 * \par Example:
 * \code
 * // Initialize
 * nvtxInitializationAttributes_t initAttribs = {0};
 * initAttribs.version = NVTX_VERSION;
 * initAttribs.size = NVTX_INITIALIZATION_ATTRIB_STRUCT_SIZE;
 *
 * // Configure the Attributes
 * initAttribs.mode = NVTX_INITIALIZATION_MODE_CALLBACK_V2;
 * initAttribs.fnptr = InitializeInjectionNvtx2;
 * \endcode
 *
 * \sa
 * ::nvtxInitializationMode_t
 * ::nvtxInitialize
 */
typedef struct nvtxInitializationAttributes_v2
{
    /**
     * \brief Version flag of the structure.
     *
     * Set to NVTX_VERSION to indicate the version of the NVTX APIs
     * supported in this header file. May optionally be overridden to
     * target another version of the tools extension library.
     */
    uint16_t version;

    /**
     * \brief Size of the structure.
     *
     * Set to the size in bytes of the attribute structure being passed.
     */
    uint16_t size;

    /**
     * \brief Mode of initialization.
     *
     * Dictates the overall behavior and which other attributes in this
     * struct are consulted.
     *
     * Default value is NVTX_INITIALIZATION_MODE_UNKNOWN (0).
     * \sa
     * ::nvtxInitializationMode_t
     */
    uint32_t mode;

    /**
     * \brief Function pointer used for initialization when the mode requires one.
     *
     * The caller retrieves this pointer from the tool library and supplies
     * it here. The mode must be one of the
     * NVTX_INITIALIZATION_MODE_CALLBACK_V# values for this member to be
     * used; the chosen mode determines both the signature the pointer is
     * cast to from void(*)(void) and the behavior expected of the function.
     *
     * Under the default mode (NVTX_INITIALIZATION_MODE_UNKNOWN) this member
     * is not used: initialization proceeds from external properties, or
     * fails where that is unsupported on the platform.
     *
     * \sa
     * ::nvtxInitializationMode_t
     */
    void(*fnptr)(void);

} nvtxInitializationAttributes_v2;

typedef struct nvtxInitializationAttributes_v2 nvtxInitializationAttributes_t;
450
+
451
+
452
+ /* ------------------------------------------------------------------------- */
453
+ /** \brief Force initialization (optional on most platforms)
454
+ *
455
+ * Force NVTX library to initialize. On some platform NVTX will implicit initialize
456
+ * upon the first function call into an NVTX API.
457
+ *
458
+ * \return Result codes are simplest to assume NVTX_SUCCESS or !NVTX_SUCCESS
459
+ *
460
+ * \param initAttrib - The initialization attribute structure
461
+ *
462
+ * \sa
463
+ * ::nvtxInitializationAttributes_t
464
+ *
465
+ * \version \NVTX_VERSION_2
466
+ * @{ */
467
+ NVTX_DECLSPEC int NVTX_API nvtxInitialize(const nvtxInitializationAttributes_t* initAttrib);
468
+ /** @} */
469
+
470
+
471
+ /** @} */ /*END defgroup*/
472
+
473
/* ========================================================================= */
/** \defgroup EVENT_ATTRIBUTES Event Attributes
 * @{
 */

/** ---------------------------------------------------------------------------
 * Payload Types
 * ------------------------------------------------------------------------- */
/* Doc fixes vs. previous revision: UNKNOWN was mislabeled "Color payload",
 * and the 32-bit integer entries were mislabeled "floating point". */
typedef enum nvtxPayloadType_t
{
    NVTX_PAYLOAD_UNKNOWN             = 0, /**< Payload attribute is unused. */
    NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1, /**< A 64 bit unsigned integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_INT64          = 2, /**< A 64 bit signed integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_DOUBLE         = 3, /**< A 64 bit floating point value is used as payload. */
    /* NVTX_VERSION_2 */
    NVTX_PAYLOAD_TYPE_UNSIGNED_INT32 = 4, /**< A 32 bit unsigned integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_INT32          = 5, /**< A 32 bit signed integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_FLOAT          = 6  /**< A 32 bit floating point value is used as payload. */
} nvtxPayloadType_t;
492
+
493
+ /** \brief Event Attribute Structure.
494
+ * \anchor EVENT_ATTRIBUTE_STRUCTURE
495
+ *
496
+ * This structure is used to describe the attributes of an event. The layout of
497
+ * the structure is defined by a specific version of the tools extension
498
+ * library and can change between different versions of the Tools Extension
499
+ * library.
500
+ *
501
+ * \par Initializing the Attributes
502
+ *
503
+ * The caller should always perform the following three tasks when using
504
+ * attributes:
505
+ * <ul>
506
+ * <li>Zero the structure
507
+ * <li>Set the version field
508
+ * <li>Set the size field
509
+ * </ul>
510
+ *
511
+ * Zeroing the structure sets all the event attributes types and values
512
+ * to the default value.
513
+ *
514
+ * The version and size field are used by the Tools Extension
515
+ * implementation to handle multiple versions of the attributes structure.
516
+ *
517
+ * It is recommended that the caller use one of the following to methods
518
+ * to initialize the event attributes structure:
519
+ *
520
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
521
+ * \code
522
+ * nvtxEventAttributes_t eventAttrib = {0};
523
+ * eventAttrib.version = NVTX_VERSION;
524
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
525
+ * \endcode
526
+ *
527
+ * \par Method 2: Initializing nvtxEventAttributes for a specific version
528
+ * \code
529
+ * nvtxEventAttributes_t eventAttrib = {0};
530
+ * eventAttrib.version = 1;
531
+ * eventAttrib.size = (uint16_t)(sizeof(nvtxEventAttributes_v1));
532
+ * \endcode
533
+ *
534
+ * If the caller uses Method 1 it is critical that the entire binary
535
+ * layout of the structure be configured to 0 so that all fields
536
+ * are initialized to the default value.
537
+ *
538
+ * The caller should either use both NVTX_VERSION and
539
+ * NVTX_EVENT_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
540
+ * and a versioned type (Method 2). Using a mix of the two methods
541
+ * will likely cause either source level incompatibility or binary
542
+ * incompatibility in the future.
543
+ *
544
+ * \par Settings Attribute Types and Values
545
+ *
546
+ *
547
+ * \par Example:
548
+ * \code
549
+ * // Initialize
550
+ * nvtxEventAttributes_t eventAttrib = {0};
551
+ * eventAttrib.version = NVTX_VERSION;
552
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
553
+ *
554
+ * // Configure the Attributes
555
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
556
+ * eventAttrib.color = 0xFF880000;
557
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
558
+ * eventAttrib.message.ascii = "Example";
559
+ * \endcode
560
+ *
561
+ * In the example the caller does not have to set the value of
562
+ * \ref ::nvtxEventAttributes_v2::category or
563
+ * \ref ::nvtxEventAttributes_v2::payload as these fields were set to
564
+ * the default value by {0}.
565
+ * \sa
566
+ * ::nvtxDomainMarkEx
567
+ * ::nvtxDomainRangeStartEx
568
+ * ::nvtxDomainRangePushEx
569
+ */
570
+ typedef struct nvtxEventAttributes_v2
571
+ {
572
+ /**
573
+ * \brief Version flag of the structure.
574
+ *
575
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
576
+ * supported in this header file. This can optionally be overridden to
577
+ * another version of the tools extension library.
578
+ */
579
+ uint16_t version;
580
+
581
+ /**
582
+ * \brief Size of the structure.
583
+ *
584
+ * Needs to be set to the size in bytes of the event attribute
585
+ * structure used to specify the event.
586
+ */
587
+ uint16_t size;
588
+
589
+ /**
590
+ * \brief ID of the category the event is assigned to.
591
+ *
592
+ * A category is a user-controlled ID that can be used to group
593
+ * events. The tool may use category IDs to improve filtering or
594
+ * enable grouping of events in the same category. The functions
595
+ * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used
596
+ * to name a category.
597
+ *
598
+ * Default Value is 0
599
+ */
600
+ uint32_t category;
601
+
602
+ /** \brief Color type specified in this attribute structure.
603
+ *
604
+ * Defines the color format of the attribute structure's \ref COLOR_FIELD
605
+ * "color" field.
606
+ *
607
+ * Default Value is NVTX_COLOR_UNKNOWN
608
+ */
609
+ int32_t colorType; /* nvtxColorType_t */
610
+
611
+ /** \brief Color assigned to this event. \anchor COLOR_FIELD
612
+ *
613
+ * The color that the tool should use to visualize the event.
614
+ */
615
+ uint32_t color;
616
+
617
+ /**
618
+ * \brief Payload type specified in this attribute structure.
619
+ *
620
+ * Defines the payload format of the attribute structure's \ref PAYLOAD_FIELD
621
+ * "payload" field.
622
+ *
623
+ * Default Value is NVTX_PAYLOAD_UNKNOWN
624
+ */
625
+ int32_t payloadType; /* nvtxPayloadType_t */
626
+
627
+ int32_t reserved0;
628
+
629
+ /**
630
+ * \brief Payload assigned to this event. \anchor PAYLOAD_FIELD
631
+ *
632
+ * A numerical value that can be used to annotate an event. The tool could
633
+ * use the payload data to reconstruct graphs and diagrams.
634
+ */
635
+ union payload_t
636
+ {
637
+ uint64_t ullValue;
638
+ int64_t llValue;
639
+ double dValue;
640
+ /* NVTX_VERSION_2 */
641
+ uint32_t uiValue;
642
+ int32_t iValue;
643
+ float fValue;
644
+ } payload;
645
+
646
+ /** \brief Message type specified in this attribute structure.
647
+ *
648
+ * Defines the message format of the attribute structure's \ref MESSAGE_FIELD
649
+ * "message" field.
650
+ *
651
+ * Default Value is NVTX_MESSAGE_UNKNOWN
652
+ */
653
+ int32_t messageType; /* nvtxMessageType_t */
654
+
655
+ /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD
656
+ *
657
+ * The text message that is attached to an event.
658
+ */
659
+ nvtxMessageValue_t message;
660
+
661
+ } nvtxEventAttributes_v2;
662
+
663
+ typedef struct nvtxEventAttributes_v2 nvtxEventAttributes_t;
664
+
665
+ /** @} */ /*END defgroup*/
666
+ /* ========================================================================= */
667
+ /** \defgroup MARKERS_AND_RANGES Markers and Ranges
668
+ *
669
+ * See \ref MARKERS_AND_RANGES for more details
670
+ *
671
+ * @{
672
+ */
673
+
674
+ /** \name Marker */
675
+
676
+ /* ------------------------------------------------------------------------- */
677
+ /** \brief Marks an instantaneous event in the application.
678
+ *
679
+ * A marker can contain a text message or specify additional information
680
+ * using the event attributes structure. These attributes include a text
681
+ * message, color, category, and a payload. Each of the attributes is optional
682
+ * and can only be sent out using the \ref nvtxDomainMarkEx function.
683
+ *
684
+ * nvtxDomainMarkEx(NULL, event) is equivalent to calling
685
+ * nvtxMarkEx(event).
686
+ *
687
+ * \param domain - The domain of scoping the category.
688
+ * \param eventAttrib - The event attribute structure defining the marker's
689
+ * attribute types and attribute values.
690
+ *
691
+ * \sa
692
+ * ::nvtxMarkEx
693
+ *
694
+ * \version \NVTX_VERSION_2
695
+ * @{ */
696
+ NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
697
+ /** @} */
698
+
699
+ /* ------------------------------------------------------------------------- */
700
+ /** \brief Marks an instantaneous event in the application.
701
+ *
702
+ * A marker can contain a text message or specify additional information
703
+ * using the event attributes structure. These attributes include a text
704
+ * message, color, category, and a payload. Each of the attributes is optional
705
+ * and can only be sent out using the \ref nvtxMarkEx function.
706
+ * If \ref nvtxMarkA or \ref nvtxMarkW are used to specify the marker
707
+ * or if an attribute is unspecified then a default value will be used.
708
+ *
709
+ * \param eventAttrib - The event attribute structure defining the marker's
710
+ * attribute types and attribute values.
711
+ *
712
+ * \par Example:
713
+ * \code
714
+ * // zero the structure
715
+ * nvtxEventAttributes_t eventAttrib = {0};
716
+ * // set the version and the size information
717
+ * eventAttrib.version = NVTX_VERSION;
718
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
719
+ * // configure the attributes. 0 is the default for all attributes.
720
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
721
+ * eventAttrib.color = 0xFF880000;
722
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
723
+ * eventAttrib.message.ascii = "Example nvtxMarkEx";
724
+ * nvtxMarkEx(&eventAttrib);
725
+ * \endcode
726
+ *
727
+ * \sa
728
+ * ::nvtxDomainMarkEx
729
+ *
730
+ * \version \NVTX_VERSION_1
731
+ * @{ */
732
+ NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib);
733
+ /** @} */
734
+
735
+ /* ------------------------------------------------------------------------- */
736
+ /** \brief Marks an instantaneous event in the application.
737
+ *
738
+ * A marker created using \ref nvtxMarkA or \ref nvtxMarkW contains only a
739
+ * text message.
740
+ *
741
+ * \param message - The message associated to this marker event.
742
+ *
743
+ * \par Example:
744
+ * \code
745
+ * nvtxMarkA("Example nvtxMarkA");
746
+ * nvtxMarkW(L"Example nvtxMarkW");
747
+ * \endcode
748
+ *
749
+ * \sa
750
+ * ::nvtxDomainMarkEx
751
+ * ::nvtxMarkEx
752
+ *
753
+ * \version \NVTX_VERSION_0
754
+ * @{ */
755
+ NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message);
756
+ NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message);
757
+ /** @} */
758
+
759
+
760
+ /** \name Process Ranges */
761
+
762
+ /* ------------------------------------------------------------------------- */
763
+ /** \brief Starts a process range in a domain.
764
+ *
765
+ * \param domain - The domain of scoping the category.
766
+ * \param eventAttrib - The event attribute structure defining the range's
767
+ * attribute types and attribute values.
768
+ *
769
+ * \return The unique ID used to correlate a pair of Start and End events.
770
+ *
771
+ * \remarks Ranges defined by Start/End can overlap.
772
+ *
773
+ * \par Example:
774
+ * \code
775
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain");
776
+ * nvtxEventAttributes_t eventAttrib = {0};
777
+ * eventAttrib.version = NVTX_VERSION;
778
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
779
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
780
+ * eventAttrib.message.ascii = "my range";
781
+ * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib);
782
+ * // ...
783
+ * nvtxDomainRangeEnd(rangeId);
784
+ * \endcode
785
+ *
786
+ * \sa
787
+ * ::nvtxDomainRangeEnd
788
+ *
789
+ * \version \NVTX_VERSION_2
790
+ * @{ */
791
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
792
+ /** @} */
793
+
794
+ /* ------------------------------------------------------------------------- */
795
+ /** \brief Starts a process range.
796
+ *
797
+ * \param eventAttrib - The event attribute structure defining the range's
798
+ * attribute types and attribute values.
799
+ *
800
+ * \return The unique ID used to correlate a pair of Start and End events.
801
+ *
802
+ * \remarks Ranges defined by Start/End can overlap.
803
+ *
804
+ * \par Example:
805
+ * \code
806
+ * nvtxEventAttributes_t eventAttrib = {0};
807
+ * eventAttrib.version = NVTX_VERSION;
808
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
809
+ * eventAttrib.category = 3;
810
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
811
+ * eventAttrib.color = 0xFF0088FF;
812
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
813
+ * eventAttrib.message.ascii = "Example Range";
814
+ * nvtxRangeId_t rangeId = nvtxRangeStartEx(&eventAttrib);
815
+ * // ...
816
+ * nvtxRangeEnd(rangeId);
817
+ * \endcode
818
+ *
819
+ * \sa
820
+ * ::nvtxRangeEnd
821
+ * ::nvtxDomainRangeStartEx
822
+ *
823
+ * \version \NVTX_VERSION_1
824
+ * @{ */
825
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib);
826
+ /** @} */
827
+
828
+ /* ------------------------------------------------------------------------- */
829
+ /** \brief Starts a process range.
830
+ *
831
+ * \param message - The event message associated to this range event.
832
+ *
833
+ * \return The unique ID used to correlate a pair of Start and End events.
834
+ *
835
+ * \remarks Ranges defined by Start/End can overlap.
836
+ *
837
+ * \par Example:
838
+ * \code
839
+ * nvtxRangeId_t r1 = nvtxRangeStartA("Range 1");
840
+ * nvtxRangeId_t r2 = nvtxRangeStartW(L"Range 2");
841
+ * nvtxRangeEnd(r1);
842
+ * nvtxRangeEnd(r2);
843
+ * \endcode
844
+ *
845
+ * \sa
846
+ * ::nvtxRangeEnd
847
+ * ::nvtxRangeStartEx
848
+ * ::nvtxDomainRangeStartEx
849
+ *
850
+ * \version \NVTX_VERSION_0
851
+ * @{ */
852
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message);
853
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message);
854
+ /** @} */
855
+
856
+ /* ------------------------------------------------------------------------- */
857
+ /** \brief Ends a process range.
858
+ *
859
+ * \param domain - The domain
860
+ * \param id - The correlation ID returned from a nvtxRangeStart call.
861
+ *
862
+ * \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd.
863
+ * It does not need a domain param since that is associated iwth the range ID at ::nvtxDomainRangeStartEx
864
+ *
865
+ * \par Example:
866
+ * \code
867
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain");
868
+ * nvtxEventAttributes_t eventAttrib = {0};
869
+ * eventAttrib.version = NVTX_VERSION;
870
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
871
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
872
+ * eventAttrib.message.ascii = "my range";
873
+ * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib);
874
+ * // ...
875
+ * nvtxDomainRangeEnd(rangeId);
876
+ * \endcode
877
+ *
878
+ * \sa
879
+ * ::nvtxDomainRangeStartEx
880
+ *
881
+ * \version \NVTX_VERSION_2
882
+ * @{ */
883
+ NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id);
884
+ /** @} */
885
+
886
+ /* ------------------------------------------------------------------------- */
887
+ /** \brief Ends a process range.
888
+ *
889
+ * \param id - The correlation ID returned from an nvtxRangeStart call.
890
+ *
891
+ * \sa
892
+ * ::nvtxDomainRangeStartEx
893
+ * ::nvtxRangeStartEx
894
+ * ::nvtxRangeStartA
895
+ * ::nvtxRangeStartW
896
+ *
897
+ * \version \NVTX_VERSION_0
898
+ * @{ */
899
+ NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id);
900
+ /** @} */
901
+
902
+ /** \name Thread Ranges */
903
+
904
+ /* ------------------------------------------------------------------------- */
905
+ /** \brief Starts a nested thread range.
906
+ *
907
+ * \param domain - The domain of scoping.
908
+ * \param eventAttrib - The event attribute structure defining the range's
909
+ * attribute types and attribute values.
910
+ *
911
+ * \return The 0 based level of range being started. This value is scoped to the domain.
912
+ * If an error occurs, a negative value is returned.
913
+ *
914
+ * \par Example:
915
+ * \code
916
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
917
+ * nvtxEventAttributes_t eventAttrib = {0};
918
+ * eventAttrib.version = NVTX_VERSION;
919
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
920
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
921
+ * eventAttrib.color = 0xFFFF0000;
922
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
923
+ * eventAttrib.message.ascii = "Level 0";
924
+ * nvtxDomainRangePushEx(domain, &eventAttrib);
925
+ *
926
+ * // Re-use eventAttrib
927
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
928
+ * eventAttrib.message.unicode = L"Level 1";
929
+ * nvtxDomainRangePushEx(domain, &eventAttrib);
930
+ *
931
+ * nvtxDomainRangePop(domain); //level 1
932
+ * nvtxDomainRangePop(domain); //level 0
933
+ * \endcode
934
+ *
935
+ * \sa
936
+ * ::nvtxDomainRangePop
937
+ *
938
+ * \version \NVTX_VERSION_2
939
+ * @{ */
940
+ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
941
+ /** @} */
942
+
943
+ /* ------------------------------------------------------------------------- */
944
+ /** \brief Starts a nested thread range.
945
+ *
946
+ * \param eventAttrib - The event attribute structure defining the range's
947
+ * attribute types and attribute values.
948
+ *
949
+ * \return The 0 based level of range being started. This level is per domain.
950
+ * If an error occurs a negative value is returned.
951
+ *
952
+ * \par Example:
953
+ * \code
954
+ * nvtxEventAttributes_t eventAttrib = {0};
955
+ * eventAttrib.version = NVTX_VERSION;
956
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
957
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
958
+ * eventAttrib.color = 0xFFFF0000;
959
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
960
+ * eventAttrib.message.ascii = "Level 0";
961
+ * nvtxRangePushEx(&eventAttrib);
962
+ *
963
+ * // Re-use eventAttrib
964
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
965
+ * eventAttrib.message.unicode = L"Level 1";
966
+ * nvtxRangePushEx(&eventAttrib);
967
+ *
968
+ * nvtxRangePop();
969
+ * nvtxRangePop();
970
+ * \endcode
971
+ *
972
+ * \sa
973
+ * ::nvtxDomainRangePushEx
974
+ * ::nvtxRangePop
975
+ *
976
+ * \version \NVTX_VERSION_1
977
+ * @{ */
978
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib);
979
+ /** @} */
980
+
981
+ /* ------------------------------------------------------------------------- */
982
+ /** \brief Starts a nested thread range.
983
+ *
984
+ * \param message - The event message associated to this range event.
985
+ *
986
+ * \return The 0 based level of range being started. If an error occurs a
987
+ * negative value is returned.
988
+ *
989
+ * \par Example:
990
+ * \code
991
+ * nvtxRangePushA("Level 0");
992
+ * nvtxRangePushW(L"Level 1");
993
+ * nvtxRangePop();
994
+ * nvtxRangePop();
995
+ * \endcode
996
+ *
997
+ * \sa
998
+ * ::nvtxDomainRangePushEx
999
+ * ::nvtxRangePop
1000
+ *
1001
+ * \version \NVTX_VERSION_0
1002
+ * @{ */
1003
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message);
1004
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message);
1005
+ /** @} */
1006
+
1007
+
1008
+ /* ------------------------------------------------------------------------- */
1009
+ /** \brief Ends a nested thread range.
1010
+ *
1011
+ * \return The level of the range being ended. If an error occurs a negative
1012
+ * value is returned on the current thread.
1013
+ *
1014
+ * \par Example:
1015
+ * \code
1016
+ * nvtxDomainHandle_t domain = nvtxDomainCreate("example library");
1017
+ * nvtxDomainRangePushA(domain, "Level 0");
1018
+ * nvtxDomainRangePushW(domain, L"Level 1");
1019
+ * nvtxDomainRangePop(domain);
1020
+ * nvtxDomainRangePop(domain);
1021
+ * \endcode
1022
+ *
1023
+ * \sa
1024
+ * ::nvtxRangePushEx
1025
+ * ::nvtxRangePushA
1026
+ * ::nvtxRangePushW
1027
+ *
1028
+ * \version \NVTX_VERSION_2
1029
+ * @{ */
1030
+ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain);
1031
+ /** @} */
1032
+
1033
+ /* ------------------------------------------------------------------------- */
1034
+ /** \brief Ends a nested thread range.
1035
+ *
1036
+ * \return The level of the range being ended. If an error occurs a negative
1037
+ * value is returned on the current thread.
1038
+ *
1039
+ * \par Example:
1040
+ * \code
1041
+ * nvtxRangePushA("Level 0");
1042
+ * nvtxRangePushW(L"Level 1");
1043
+ * nvtxRangePop();
1044
+ * nvtxRangePop();
1045
+ * \endcode
1046
+ *
1047
+ * \sa
1048
+ * ::nvtxRangePushEx
1049
+ * ::nvtxRangePushA
1050
+ * ::nvtxRangePushW
1051
+ *
1052
+ * \version \NVTX_VERSION_0
1053
+ * @{ */
1054
+ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void);
1055
+ /** @} */
1056
+
1057
+
1058
/** @} */ /*END defgroup*/
/* ========================================================================= */
/** \defgroup RESOURCE_NAMING Resource Naming
 *
 * See \ref RESOURCE_NAMING for more details
 *
 * @{
 */


/* ------------------------------------------------------------------------- */
/** \name Functions for Generic Resource Naming*/
/* ------------------------------------------------------------------------- */

/* ------------------------------------------------------------------------- */
/** \cond SHOW_HIDDEN
 * \brief Resource typing helpers.
 *
 * Classes make it easy to create a series of resource types per API
 * without collisions: the class occupies the high 16 bits, the index the
 * low 16 bits.
 */
#define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX)))
#define NVTX_RESOURCE_CLASS_GENERIC 1
/** \endcond */

/* ------------------------------------------------------------------------- */
/** \brief Generic resource type for when a resource class is not available.
 *
 * \sa
 * ::nvtxDomainResourceCreate
 *
 * \version \NVTX_VERSION_2
 */
typedef enum nvtxResourceGenericType_t
{
    NVTX_RESOURCE_TYPE_UNKNOWN               = 0,
    NVTX_RESOURCE_TYPE_GENERIC_POINTER       = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 1), /**< Generic pointer assumed to have no collisions with other pointers. */
    NVTX_RESOURCE_TYPE_GENERIC_HANDLE        = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 2), /**< Generic handle assumed to have no collisions with other handles. */
    NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 3), /**< OS native thread identifier. */
    NVTX_RESOURCE_TYPE_GENERIC_THREAD_POSIX  = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 4)  /**< POSIX pthread identifier. */
} nvtxResourceGenericType_t;
1099
+
1100
+
1101
+
1102
+ /** \brief Resource Attribute Structure.
1103
+ * \anchor RESOURCE_ATTRIBUTE_STRUCTURE
1104
+ *
1105
+ * This structure is used to describe the attributes of a resource. The layout of
1106
+ * the structure is defined by a specific version of the tools extension
1107
+ * library and can change between different versions of the Tools Extension
1108
+ * library.
1109
+ *
1110
+ * \par Initializing the Attributes
1111
+ *
1112
+ * The caller should always perform the following three tasks when using
1113
+ * attributes:
1114
+ * <ul>
1115
+ * <li>Zero the structure
1116
+ * <li>Set the version field
1117
+ * <li>Set the size field
1118
+ * </ul>
1119
+ *
1120
+ * Zeroing the structure sets all the resource attributes types and values
1121
+ * to the default value.
1122
+ *
1123
+ * The version and size field are used by the Tools Extension
1124
+ * implementation to handle multiple versions of the attributes structure.
1125
+ *
1126
+ * It is recommended that the caller use one of the following to methods
1127
+ * to initialize the event attributes structure:
1128
+ *
1129
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
1130
+ * \code
1131
+ * nvtxResourceAttributes_t attribs = {0};
1132
+ * attribs.version = NVTX_VERSION;
1133
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1134
+ * \endcode
1135
+ *
1136
+ * \par Method 2: Initializing nvtxEventAttributes for a specific version
1137
+ * \code
1138
+ * nvtxResourceAttributes_v0 attribs = {0};
1139
+ * attribs.version = 2;
1140
+ * attribs.size = (uint16_t)(sizeof(nvtxResourceAttributes_v0));
1141
+ * \endcode
1142
+ *
1143
+ * If the caller uses Method 1 it is critical that the entire binary
1144
+ * layout of the structure be configured to 0 so that all fields
1145
+ * are initialized to the default value.
1146
+ *
1147
+ * The caller should either use both NVTX_VERSION and
1148
+ * NVTX_RESOURCE_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
1149
+ * and a versioned type (Method 2). Using a mix of the two methods
1150
+ * will likely cause either source level incompatibility or binary
1151
+ * incompatibility in the future.
1152
+ *
1153
+ * \par Settings Attribute Types and Values
1154
+ *
1155
+ *
1156
+ * \par Example:
1157
+ * \code
1158
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1159
+ *
1160
+ * // Initialize
1161
+ * nvtxResourceAttributes_t attribs = {0};
1162
+ * attribs.version = NVTX_VERSION;
1163
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1164
+ *
1165
+ * // Configure the Attributes
1166
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1167
+ * attribs.identifier.pValue = (const void*)pMutex;
1168
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1169
+ * attribs.message.ascii = "Single thread access to database.";
1170
+ *
1171
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1172
+ * \endcode
1173
+ *
1174
+ * \sa
1175
+ * ::nvtxDomainResourceCreate
1176
+ */
1177
+ typedef struct nvtxResourceAttributes_v0
1178
+ {
1179
+ /**
1180
+ * \brief Version flag of the structure.
1181
+ *
1182
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
1183
+ * supported in this header file. This can optionally be overridden to
1184
+ * another version of the tools extension library.
1185
+ */
1186
+ uint16_t version;
1187
+
1188
+ /**
1189
+ * \brief Size of the structure.
1190
+ *
1191
+ * Needs to be set to the size in bytes of this attribute
1192
+ * structure.
1193
+ */
1194
+ uint16_t size;
1195
+
1196
+ /**
1197
+ * \brief Identifier type specifies how to interpret the identifier field
1198
+ *
1199
+ * Defines the identifier format of the attribute structure's \ref RESOURCE_IDENTIFIER_FIELD
1200
+ * "identifier" field.
1201
+ *
1202
+ * Default Value is NVTX_RESOURCE_TYPE_UNKNOWN
1203
+ */
1204
+ int32_t identifierType; /* values from enums following the pattern nvtxResource[name]Type_t */
1205
+
1206
+ /**
1207
+ * \brief Identifier for the resource.
1208
+ * \anchor RESOURCE_IDENTIFIER_FIELD
1209
+ *
1210
+ * An identifier may be a pointer or a handle to an OS or middleware API object.
1211
+ * The resource type will assist in avoiding collisions where handles values may collide.
1212
+ */
1213
+ union identifier_t
1214
+ {
1215
+ const void* pValue;
1216
+ uint64_t ullValue;
1217
+ } identifier;
1218
+
1219
+ /** \brief Message type specified in this attribute structure.
1220
+ *
1221
+ * Defines the message format of the attribute structure's \ref RESOURCE_MESSAGE_FIELD
1222
+ * "message" field.
1223
+ *
1224
+ * Default Value is NVTX_MESSAGE_UNKNOWN
1225
+ */
1226
+ int32_t messageType; /* nvtxMessageType_t */
1227
+
1228
+ /** \brief Message assigned to this attribute structure. \anchor RESOURCE_MESSAGE_FIELD
1229
+ *
1230
+ * The text message that is attached to a resource.
1231
+ */
1232
+ nvtxMessageValue_t message;
1233
+
1234
+ } nvtxResourceAttributes_v0;
1235
+
1236
+ typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t;
1237
+
1238
+ /* \cond SHOW_HIDDEN
1239
+ * \version \NVTX_VERSION_2
1240
+ */
1241
+ #define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) )
1242
+ typedef struct nvtxResourceHandle* nvtxResourceHandle_t;
1243
+ /** \endcond */
1244
+
1245
+
1246
+
1247
+ /* ------------------------------------------------------------------------- */
1248
+ /** \brief Create a resource object to track and associate data with OS and middleware objects
1249
+ *
1250
+ * Allows users to associate an API handle or pointer with a user-provided name.
1251
+ *
1252
+ *
1253
+ * \param domain - Domain to own the resource object
1254
+ * \param attribs - Attributes to be associated with the resource
1255
+ *
1256
+ * \return A handle that represents the newly created resource object.
1257
+ *
1258
+ * \par Example:
1259
+ * \code
1260
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1261
+ * nvtxResourceAttributes_t attribs = {0};
1262
+ * attribs.version = NVTX_VERSION;
1263
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1264
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1265
+ * attribs.identifier.pValue = (const void*)pMutex;
1266
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1267
+ * attribs.message.ascii = "Single thread access to database.";
1268
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1269
+ * \endcode
1270
+ *
1271
+ * \sa
1272
+ * ::nvtxResourceAttributes_t
1273
+ * ::nvtxDomainResourceDestroy
1274
+ *
1275
+ * \version \NVTX_VERSION_2
1276
+ * @{ */
1277
+ NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
1278
+ /** @} */
1279
+
1280
+ /* ------------------------------------------------------------------------- */
1281
+ /** \brief Destroy a resource object to track and associate data with OS and middleware objects
1282
+ *
1283
+ * Allows users to associate an API handle or pointer with a user-provided name.
1284
+ *
1285
+ * \param resource - Handle to the resource in which to operate.
1286
+ *
1287
+ * \par Example:
1288
+ * \code
1289
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1290
+ * nvtxResourceAttributes_t attribs = {0};
1291
+ * attribs.version = NVTX_VERSION;
1292
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1293
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1294
+ * attribs.identifier.pValue = (const void*)pMutex;
1295
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1296
+ * attribs.message.ascii = "Single thread access to database.";
1297
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1298
+ * nvtxDomainResourceDestroy(handle);
1299
+ * \endcode
1300
+ *
1301
+ * \sa
1302
+ * ::nvtxDomainResourceCreate
1303
+ *
1304
+ * \version \NVTX_VERSION_2
1305
+ * @{ */
1306
+ NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource);
1307
+ /** @} */
1308
+
1309
+
1310
+ /** \name Functions for NVTX Category Naming*/
1311
+
1312
+ /* ------------------------------------------------------------------------- */
1313
+ /**
1314
+ * \brief Annotate an NVTX category used within a domain.
1315
+ *
1316
+ * Categories are used to group sets of events. Each category is identified
1317
+ * through a unique ID and that ID is passed into any of the marker/range
1318
+ * events to assign that event to a specific category. The nvtxDomainNameCategory
1319
+ * function calls allow the user to assign a name to a category ID that is
1320
+ * specific to the domain.
1321
+ *
1322
+ * nvtxDomainNameCategory(NULL, category, name) is equivalent to calling
1323
+ * nvtxNameCategory(category, name).
1324
+ *
1325
+ * \param domain - The domain of scoping the category.
1326
+ * \param category - The category ID to name.
1327
+ * \param name - The name of the category.
1328
+ *
1329
+ * \remarks The category names are tracked per domain.
1330
+ *
1331
+ * \par Example:
1332
+ * \code
1333
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example");
1334
+ * nvtxDomainNameCategoryA(domain, 1, "Memory Allocation");
1335
+ * nvtxDomainNameCategoryW(domain, 2, L"Memory Transfer");
1336
+ * \endcode
1337
+ *
1338
+ * \version \NVTX_VERSION_2
1339
+ * @{ */
1340
+ NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name);
1341
+ NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
1342
+ /** @} */
1343
+
1344
+ /** \brief Annotate an NVTX category.
1345
+ *
1346
+ * Categories are used to group sets of events. Each category is identified
1347
+ * through a unique ID and that ID is passed into any of the marker/range
1348
+ * events to assign that event to a specific category. The nvtxNameCategory
1349
+ * function calls allow the user to assign a name to a category ID.
1350
+ *
1351
+ * \param category - The category ID to name.
1352
+ * \param name - The name of the category.
1353
+ *
1354
+ * \remarks The category names are tracked per process.
1355
+ *
1356
+ * \par Example:
1357
+ * \code
1358
+ * nvtxNameCategory(1, "Memory Allocation");
1359
+ * nvtxNameCategory(2, "Memory Transfer");
1360
+ * nvtxNameCategory(3, "Memory Object Lifetime");
1361
+ * \endcode
1362
+ *
1363
+ * \version \NVTX_VERSION_1
1364
+ * @{ */
1365
+ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name);
1366
+ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name);
1367
+ /** @} */
1368
+
1369
+ /** \name Functions for OS Threads Naming*/
1370
+
1371
+ /* ------------------------------------------------------------------------- */
1372
+ /** \brief Annotate an OS thread.
1373
+ *
1374
+ * Allows the user to name an active thread of the current process. If an
1375
+ * invalid thread ID is provided or a thread ID from a different process is
1376
+ * used the behavior of the tool is implementation dependent.
1377
+ *
1378
+ * The thread name is associated to the default domain. To support domains
1379
+ * use resource objects via ::nvtxDomainResourceCreate.
1380
+ *
1381
+ * \param threadId - The ID of the thread to name.
1382
+ * \param name - The name of the thread.
1383
+ *
1384
+ * \par Example:
1385
+ * \code
1386
+ * nvtxNameOsThread(GetCurrentThreadId(), "MAIN_THREAD");
1387
+ * \endcode
1388
+ *
1389
+ * \version \NVTX_VERSION_1
1390
+ * @{ */
1391
+ NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name);
1392
+ NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name);
1393
+ /** @} */
1394
+
1395
+
1396
+ /** @} */ /*END defgroup*/
1397
+ /* ========================================================================= */
1398
+ /** \defgroup STRING_REGISTRATION String Registration
1399
+ *
1400
+ * Registered strings are intended to increase performance by lowering instrumentation
1401
+ * overhead. String may be registered once and the handle may be passed in place of
1402
+ * a string where an the APIs may allow.
1403
+ *
1404
+ * See \ref STRING_REGISTRATION for more details
1405
+ *
1406
+ * @{
1407
+ */
1408
+
1409
+ /* ------------------------------------------------------------------------- */
1410
+ /** \brief Register a string.
1411
+
1412
+ * Registers an immutable string with NVTX. Once registered the pointer used
1413
+ * to register the domain name can be used in nvtxEventAttributes_t
1414
+ * \ref MESSAGE_FIELD. This allows NVTX implementation to skip copying the
1415
+ * contents of the message on each event invocation.
1416
+ *
1417
+ * String registration is an optimization. It is recommended to use string
1418
+ * registration if the string will be passed to an event many times.
1419
+ *
1420
+ * String are not unregistered, except that by unregistering the entire domain
1421
+ *
1422
+ * \param domain - Domain handle. If NULL then the global domain is used.
1423
+ * \param string - A unique pointer to a sequence of characters.
1424
+ *
1425
+ * \return A handle representing the registered string.
1426
+ *
1427
+ * \par Example:
1428
+ * \code
1429
+ * nvtxDomainCreateA("com.nvidia.nvtx.example");
1430
+ * nvtxStringHandle_t message = nvtxDomainRegisterStringA(domain, "registered string");
1431
+ * nvtxEventAttributes_t eventAttrib = {0};
1432
+ * eventAttrib.version = NVTX_VERSION;
1433
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1434
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_REGISTERED;
1435
+ * eventAttrib.message.registered = message;
1436
+ * \endcode
1437
+ *
1438
+ * \version \NVTX_VERSION_2
1439
+ * @{ */
1440
+ NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string);
1441
+ NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string);
1442
+ /** @} */
1443
+
1444
+ /** @} */ /*END defgroup*/
1445
+ /* ========================================================================= */
1446
+ /** \defgroup DOMAINS Domains
1447
+ *
1448
+ * Domains are used to group events to a developer defined scope. Middleware
1449
+ * vendors may also scope their own events to avoid collisions with the
1450
+ * the application developer's events, so that the application developer may
1451
+ * inspect both parts and easily differentiate or filter them. By default
1452
+ * all events are scoped to a global domain where NULL is provided or when
1453
+ * using APIs provided b versions of NVTX below v2
1454
+ *
1455
+ * Domains are intended to be typically long lived objects with the intention
1456
+ * of logically separating events of large modules from each other such as
1457
+ * middleware libraries from each other and the main application.
1458
+ *
1459
+ * See \ref DOMAINS for more details
1460
+ *
1461
+ * @{
1462
+ */
1463
+
1464
+ /* ------------------------------------------------------------------------- */
1465
+ /** \brief Register a NVTX domain.
1466
+ *
1467
+ * Domains are used to scope annotations. All NVTX_VERSION_0 and NVTX_VERSION_1
1468
+ * annotations are scoped to the global domain. The function nvtxDomainCreate
1469
+ * creates a new named domain.
1470
+ *
1471
+ * Each domain maintains its own nvtxRangePush and nvtxRangePop stack.
1472
+ *
1473
+ * \param name - A unique string representing the domain.
1474
+ *
1475
+ * \return A handle representing the domain.
1476
+ *
1477
+ * \par Example:
1478
+ * \code
1479
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example");
1480
+ *
1481
+ * nvtxMarkA("nvtxMarkA to global domain");
1482
+ *
1483
+ * nvtxEventAttributes_t eventAttrib1 = {0};
1484
+ * eventAttrib1.version = NVTX_VERSION;
1485
+ * eventAttrib1.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1486
+ * eventAttrib1.message.ascii = "nvtxDomainMarkEx to global domain";
1487
+ * nvtxDomainMarkEx(NULL, &eventAttrib1);
1488
+ *
1489
+ * nvtxEventAttributes_t eventAttrib2 = {0};
1490
+ * eventAttrib2.version = NVTX_VERSION;
1491
+ * eventAttrib2.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1492
+ * eventAttrib2.message.ascii = "nvtxDomainMarkEx to com.nvidia.nvtx.example";
1493
+ * nvtxDomainMarkEx(domain, &eventAttrib2);
1494
+ * nvtxDomainDestroy(domain);
1495
+ * \endcode
1496
+ *
1497
+ * \sa
1498
+ * ::nvtxDomainDestroy
1499
+ *
1500
+ * \version \NVTX_VERSION_2
1501
+ * @{ */
1502
+ NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* name);
1503
+ NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* name);
1504
+ /** @} */
1505
+
1506
+ /* ------------------------------------------------------------------------- */
1507
+ /** \brief Unregister a NVTX domain.
1508
+ *
1509
+ * Unregisters the domain handle and frees all domain specific resources.
1510
+ *
1511
+ * \param domain - the domain handle
1512
+ *
1513
+ * \par Example:
1514
+ * \code
1515
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example");
1516
+ * nvtxDomainDestroy(domain);
1517
+ * \endcode
1518
+ *
1519
+ * \sa
1520
+ * ::nvtxDomainCreateA
1521
+ * ::nvtxDomainCreateW
1522
+ *
1523
+ * \version \NVTX_VERSION_2
1524
+ * @{ */
1525
+ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain);
1526
+ /** @} */
1527
+
1528
+
1529
+ /** @} */ /*END defgroup*/
1530
+ /* ========================================================================= */
1531
+ /** \cond SHOW_HIDDEN */
1532
+
1533
+ #ifdef UNICODE
1534
+ #define nvtxMark nvtxMarkW
1535
+ #define nvtxRangeStart nvtxRangeStartW
1536
+ #define nvtxRangePush nvtxRangePushW
1537
+ #define nvtxNameCategory nvtxNameCategoryW
1538
+ #define nvtxNameOsThread nvtxNameOsThreadW
1539
+ /* NVTX_VERSION_2 */
1540
+ #define nvtxDomainCreate nvtxDomainCreateW
1541
+ #define nvtxDomainRegisterString nvtxDomainRegisterStringW
1542
+ #define nvtxDomainNameCategory nvtxDomainNameCategoryW
1543
+ #else
1544
+ #define nvtxMark nvtxMarkA
1545
+ #define nvtxRangeStart nvtxRangeStartA
1546
+ #define nvtxRangePush nvtxRangePushA
1547
+ #define nvtxNameCategory nvtxNameCategoryA
1548
+ #define nvtxNameOsThread nvtxNameOsThreadA
1549
+ /* NVTX_VERSION_2 */
1550
+ #define nvtxDomainCreate nvtxDomainCreateA
1551
+ #define nvtxDomainRegisterString nvtxDomainRegisterStringA
1552
+ #define nvtxDomainNameCategory nvtxDomainNameCategoryA
1553
+ #endif
1554
+
1555
+ /** \endcond */
1556
+
1557
+ #ifdef __cplusplus
1558
+ }
1559
+ #endif /* __cplusplus */
1560
+
1561
+ #endif /* NVTOOLSEXT_H_ */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCuda.h ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #ifndef NVTOOLSEXT_CUDA_H_
39
+ #define NVTOOLSEXT_CUDA_H_
40
+
41
+ #include "cuda.h"
42
+
43
+ #include "nvToolsExt.h"
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for CUDA Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_CUDA CUDA Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate CUDA resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_CUDA 4
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for CUDA
71
+ */
72
+ typedef enum nvtxResourceCUDAType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
75
+ NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
76
+ NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
77
+ NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4) /* CUevent */
78
+ } nvtxResourceCUDAType_t;
79
+
80
+
81
+ /* ------------------------------------------------------------------------- */
82
+ /** \brief Annotates a CUDA device.
83
+ *
84
+ * Allows the user to associate a CUDA device with a user-provided name.
85
+ *
86
+ * \param device - The handle of the CUDA device to name.
87
+ * \param name - The name of the CUDA device.
88
+ *
89
+ * \version \NVTX_VERSION_1
90
+ * @{ */
91
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
92
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
93
+ /** @} */
94
+
95
+ /* ------------------------------------------------------------------------- */
96
+ /** \brief Annotates a CUDA context.
97
+ *
98
+ * Allows the user to associate a CUDA context with a user-provided name.
99
+ *
100
+ * \param context - The handle of the CUDA context to name.
101
+ * \param name - The name of the CUDA context.
102
+ *
103
+ * \par Example:
104
+ * \code
105
+ * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
106
+ * if ( CUDA_SUCCESS != status )
107
+ * goto Error;
108
+ * nvtxNameCuContext(cuContext, "CTX_NAME");
109
+ * \endcode
110
+ *
111
+ * \version \NVTX_VERSION_1
112
+ * @{ */
113
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
114
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
115
+ /** @} */
116
+
117
+ /* ------------------------------------------------------------------------- */
118
+ /** \brief Annotates a CUDA stream.
119
+ *
120
+ * Allows the user to associate a CUDA stream with a user-provided name.
121
+ *
122
+ * \param stream - The handle of the CUDA stream to name.
123
+ * \param name - The name of the CUDA stream.
124
+ *
125
+ * \version \NVTX_VERSION_1
126
+ * @{ */
127
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
128
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
129
+ /** @} */
130
+
131
+ /* ------------------------------------------------------------------------- */
132
+ /** \brief Annotates a CUDA event.
133
+ *
134
+ * Allows the user to associate a CUDA event with a user-provided name.
135
+ *
136
+ * \param event - The handle of the CUDA event to name.
137
+ * \param name - The name of the CUDA event.
138
+ *
139
+ * \version \NVTX_VERSION_1
140
+ * @{ */
141
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
142
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
143
+ /** @} */
144
+
145
+ /** @} */ /* END RESOURCE_NAMING */
146
+
147
+ /* ========================================================================= */
148
+ #ifdef UNICODE
149
+ #define nvtxNameCuDevice nvtxNameCuDeviceW
150
+ #define nvtxNameCuContext nvtxNameCuContextW
151
+ #define nvtxNameCuStream nvtxNameCuStreamW
152
+ #define nvtxNameCuEvent nvtxNameCuEventW
153
+ #else
154
+ #define nvtxNameCuDevice nvtxNameCuDeviceA
155
+ #define nvtxNameCuContext nvtxNameCuContextA
156
+ #define nvtxNameCuStream nvtxNameCuStreamA
157
+ #define nvtxNameCuEvent nvtxNameCuEventA
158
+ #endif
159
+
160
+ #ifdef __cplusplus
161
+ }
162
+ #endif /* __cplusplus */
163
+
164
+ #endif /* NVTOOLSEXT_CUDA_H_ */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtCudaRt.h ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #ifndef NVTOOLSEXT_CUDART_H_
39
+ #define NVTOOLSEXT_CUDART_H_
40
+
41
+ #include "cuda.h"
42
+ #include "driver_types.h"
43
+
44
+ #include "nvToolsExt.h"
45
+
46
+ #ifdef __cplusplus
47
+ extern "C" {
48
+ #endif /* __cplusplus */
49
+
50
+ /* ========================================================================= */
51
+ /** \name Functions for CUDA Resource Naming
52
+ */
53
+ /** \addtogroup RESOURCE_NAMING
54
+ * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
55
+ *
56
+ * This section covers the API functions that allow to annotate CUDA resources
57
+ * with user-provided names.
58
+ *
59
+ * @{
60
+ */
61
+
62
+ /* ------------------------------------------------------------------------- */
63
+ /* \cond SHOW_HIDDEN
64
+ * \brief Used to build a non-colliding value for resource types separated class
65
+ * \version \NVTX_VERSION_2
66
+ */
67
+ #define NVTX_RESOURCE_CLASS_CUDART 5
68
+ /** \endcond */
69
+
70
+ /* ------------------------------------------------------------------------- */
71
+ /** \brief Resource types for CUDART
72
+ */
73
+ typedef enum nvtxResourceCUDARTType_t
74
+ {
75
+ NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
76
+ NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
77
+ NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2) /* cudaEvent_t */
78
+ } nvtxResourceCUDARTType_t;
79
+
80
+
81
+ /* ------------------------------------------------------------------------- */
82
+ /** \brief Annotates a CUDA device.
83
+ *
84
+ * Allows the user to associate a CUDA device with a user-provided name.
85
+ *
86
+ * \param device - The id of the CUDA device to name.
87
+ * \param name - The name of the CUDA device.
88
+ *
89
+ * \version \NVTX_VERSION_1
90
+ * @{ */
91
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
92
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
93
+ /** @} */
94
+
95
+ /* ------------------------------------------------------------------------- */
96
+ /** \brief Annotates a CUDA stream.
97
+ *
98
+ * Allows the user to associate a CUDA stream with a user-provided name.
99
+ *
100
+ * \param stream - The handle of the CUDA stream to name.
101
+ * \param name - The name of the CUDA stream.
102
+ *
103
+ * \version \NVTX_VERSION_1
104
+ * @{ */
105
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
106
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
107
+ /** @} */
108
+
109
+ /* ------------------------------------------------------------------------- */
110
+ /** \brief Annotates a CUDA event.
111
+ *
112
+ * Allows the user to associate a CUDA event with a user-provided name.
113
+ *
114
+ * \param event - The handle of the CUDA event to name.
115
+ * \param name - The name of the CUDA event.
116
+ *
117
+ * \version \NVTX_VERSION_1
118
+ * @{ */
119
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
120
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
121
+ /** @} */
122
+
123
+ /** @} */ /* END RESOURCE_NAMING */
124
+
125
+ /* ========================================================================= */
126
+ #ifdef UNICODE
127
+ #define nvtxNameCudaDevice nvtxNameCudaDeviceW
128
+ #define nvtxNameCudaStream nvtxNameCudaStreamW
129
+ #define nvtxNameCudaEvent nvtxNameCudaEventW
130
+ #else
131
+ #define nvtxNameCudaDevice nvtxNameCudaDeviceA
132
+ #define nvtxNameCudaStream nvtxNameCudaStreamA
133
+ #define nvtxNameCudaEvent nvtxNameCudaEventA
134
+ #endif
135
+
136
+ #ifdef __cplusplus
137
+ }
138
+ #endif /* __cplusplus */
139
+
140
+ #endif /* NVTOOLSEXT_CUDART_H_ */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtOpenCL.h ADDED
@@ -0,0 +1,214 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #ifndef NVTOOLSEXT_OPENCL_H_
39
+ #define NVTOOLSEXT_OPENCL_H_
40
+
41
+ #include <CL/cl.h>
42
+
43
+ #include "nvToolsExt.h"
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for OpenCL Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate OpenCL resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_OPENCL 6
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for OpenCL
71
+ */
72
+ typedef enum nvtxResourceOpenCLType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
75
+ NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
76
+ NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
77
+ NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
78
+ NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
79
+ NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
80
+ NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7)
81
+ } nvtxResourceOpenCLType_t;
82
+
83
+
84
+ /* ------------------------------------------------------------------------- */
85
+ /** \brief Annotates an OpenCL device.
86
+ *
87
+ * Allows to associate an OpenCL device with a user-provided name.
88
+ *
89
+ * \param device - The handle of the OpenCL device to name.
90
+ * \param name - The name of the OpenCL device.
91
+ *
92
+ * \version \NVTX_VERSION_1
93
+ * @{ */
94
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
95
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
96
+ /** @} */
97
+
98
+ /* ------------------------------------------------------------------------- */
99
+ /** \brief Annotates an OpenCL context.
100
+ *
101
+ * Allows to associate an OpenCL context with a user-provided name.
102
+ *
103
+ * \param context - The handle of the OpenCL context to name.
104
+ * \param name - The name of the OpenCL context.
105
+ *
106
+ * \version \NVTX_VERSION_1
107
+ * @{ */
108
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
109
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
110
+ /** @} */
111
+
112
+ /* ------------------------------------------------------------------------- */
113
+ /** \brief Annotates an OpenCL command queue.
114
+ *
115
+ * Allows to associate an OpenCL command queue with a user-provided name.
116
+ *
117
+ * \param command_queue - The handle of the OpenCL command queue to name.
118
+ * \param name - The name of the OpenCL command queue.
119
+ *
120
+ * \version \NVTX_VERSION_1
121
+ * @{ */
122
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
123
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
124
+ /** @} */
125
+
126
+ /* ------------------------------------------------------------------------- */
127
+ /** \brief Annotates an OpenCL memory object.
128
+ *
129
+ * Allows to associate an OpenCL memory object with a user-provided name.
130
+ *
131
+ * \param memobj - The handle of the OpenCL memory object to name.
132
+ * \param name - The name of the OpenCL memory object.
133
+ *
134
+ * \version \NVTX_VERSION_1
135
+ * @{ */
136
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
137
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
138
+ /** @} */
139
+
140
+ /* ------------------------------------------------------------------------- */
141
+ /** \brief Annotates an OpenCL sampler.
142
+ *
143
+ * Allows to associate an OpenCL sampler with a user-provided name.
144
+ *
145
+ * \param sampler - The handle of the OpenCL sampler to name.
146
+ * \param name - The name of the OpenCL sampler.
147
+ *
148
+ * \version \NVTX_VERSION_1
149
+ * @{ */
150
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
151
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
152
+ /** @} */
153
+
154
+ /* ------------------------------------------------------------------------- */
155
+ /** \brief Annotates an OpenCL program.
156
+ *
157
+ * Allows to associate an OpenCL program with a user-provided name.
158
+ *
159
+ * \param program - The handle of the OpenCL program to name.
160
+ * \param name - The name of the OpenCL program.
161
+ *
162
+ * \code
163
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
164
+ * (const char **) &cSourceCL, &program_length, &ciErrNum);
165
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
166
+ * nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
167
+ * \endcode
168
+ *
169
+ * \version \NVTX_VERSION_1
170
+ * @{ */
171
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
172
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
173
+ /** @} */
174
+
175
+ /* ------------------------------------------------------------------------- */
176
+ /** \brief Annotates an OpenCL event.
177
+ *
178
+ * Allows to associate an OpenCL event with a user-provided name.
179
+ *
180
+ * \param evnt - The handle of the OpenCL event to name.
181
+ * \param name - The name of the OpenCL event.
182
+ *
183
+ * \version \NVTX_VERSION_1
184
+ * @{ */
185
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
186
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
187
+ /** @} */
188
+
189
+ /** @} */ /* END RESOURCE_NAMING */
190
+
191
+ /* ========================================================================= */
192
+ #ifdef UNICODE
193
+ #define nvtxNameClDevice nvtxNameClDeviceW
194
+ #define nvtxNameClContext nvtxNameClContextW
195
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueW
196
+ #define nvtxNameClMemObject nvtxNameClMemObjectW
197
+ #define nvtxNameClSampler nvtxNameClSamplerW
198
+ #define nvtxNameClProgram nvtxNameClProgramW
199
+ #define nvtxNameClEvent nvtxNameClEventW
200
+ #else
201
+ #define nvtxNameClDevice nvtxNameClDeviceA
202
+ #define nvtxNameClContext nvtxNameClContextA
203
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueA
204
+ #define nvtxNameClMemObject nvtxNameClMemObjectA
205
+ #define nvtxNameClSampler nvtxNameClSamplerA
206
+ #define nvtxNameClProgram nvtxNameClProgramA
207
+ #define nvtxNameClEvent nvtxNameClEventA
208
+ #endif
209
+
210
+ #ifdef __cplusplus
211
+ }
212
+ #endif /* __cplusplus */
213
+
214
+ #endif /* NVTOOLSEXT_OPENCL_H_ */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvToolsExtSync.h ADDED
@@ -0,0 +1,406 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #ifndef NVTOOLSEXT_SYNC_H_
39
+ #define NVTOOLSEXT_SYNC_H_
40
+
41
+ #include "nvToolsExt.h"
42
+
43
+
44
+ #ifdef __cplusplus
45
+ extern "C" {
46
+ #endif /* __cplusplus */
47
+
48
+ /* \cond SHOW_HIDDEN
49
+ * \version \NVTX_VERSION_2
50
+ */
51
+ #define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
52
+ /** \endcond */
53
+
54
+
55
+ /**
56
+ * \page PAGE_SYNCHRONIZATION Synchronization
57
+ *
58
+ * This section covers a subset of the API that allow users to track additional
59
+ * synchronization details of their application. Naming OS synchronization primitives
60
+ * may allow users to better understand the data collected by traced synchronization
61
+ * APIs. Additionally, a user defined synchronization object can allow the users to
62
+ * to tell the tools when the user is building their own synchronization system
63
+ * that do not rely on the OS to provide behaviors and instead use techniques like
64
+ * atomic operations and spinlocks.
65
+ *
66
+ * See module \ref SYNCHRONIZATION for details.
67
+ *
68
+ * \par Example:
69
+ * \code
70
+ * class MyMutex
71
+ * {
72
+ * volatile long bLocked;
73
+ * nvtxSyncUser_t hSync;
74
+ * public:
75
+ * MyMutex(const char* name, nvtxDomainHandle_t d){
76
+ * bLocked = 0;
77
+ *
78
+ * nvtxSyncUserAttributes_t attribs = { 0 };
79
+ * attribs.version = NVTX_VERSION;
80
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
81
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
82
+ * attribs.message.ascii = name;
83
+ * hSync = nvtxDomainSyncUserCreate(d, &attribs);
84
+ * }
85
+ *
86
+ * ~MyMutex() {
87
+ * nvtxDomainSyncUserDestroy(hSync);
88
+ * }
89
+ *
90
+ * bool Lock() {
91
+ * nvtxDomainSyncUserAcquireStart(hSync);
92
+ * bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
93
+
94
+ * if (acquired) {
95
+ * nvtxDomainSyncUserAcquireSuccess(hSync);
96
+ * }
97
+ * else {
98
+ * nvtxDomainSyncUserAcquireFailed(hSync);
99
+ * }
100
+ * return acquired;
101
+ * }
102
+
103
+ * void Unlock() {
104
+ * nvtxDomainSyncUserReleasing(hSync);
105
+ * bLocked = false;
106
+ * }
107
+ * };
108
+ * \endcode
109
+ *
110
+ * \version \NVTX_VERSION_2
111
+ */
112
+
113
+ /* ------------------------------------------------------------------------- */
114
+ /* \cond SHOW_HIDDEN
115
+ * \brief Used to build a non-colliding value for resource types separated class
116
+ * \version \NVTX_VERSION_2
117
+ */
118
+ #define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */
119
+ #define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/
120
+ /** \endcond */
121
+
122
+
123
+ /* ------------------------------------------------------------------------- */
124
+ /** \defgroup SYNCHRONIZATION Synchronization
125
+ * See page \ref PAGE_SYNCHRONIZATION.
126
+ * @{
127
+ */
128
+
129
+ /** \brief Resource type values for OSs with POSIX Thread API support
130
+ */
131
+ typedef enum nvtxResourceSyncPosixThreadType_t
132
+ {
133
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */
134
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */
135
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */
136
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */
137
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */
138
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */
139
+ } nvtxResourceSyncPosixThreadType_t;
140
+
141
+ /** \brief Resource type values for Windows OSs
142
+ */
143
+ typedef enum nvtxResourceSyncWindowsType_t
144
+ {
145
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
146
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
147
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
148
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
149
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
150
+ } nvtxResourceSyncWindowsType_t;
151
+
152
+ /** \brief Resource type values for Linux and Linux derived OSs such as Android
153
+ * \sa
154
+ * ::nvtxResourceSyncPosixThreadType_t
155
+ */
156
+ typedef enum nvtxResourceSyncLinuxType_t
157
+ {
158
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
159
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
160
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
161
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
162
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
163
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
164
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
165
+ } nvtxResourceSyncLinuxType_t;
166
+
167
+ /** \brief Resource type values for Android come from Linux.
168
+ * \sa
169
+ * ::nvtxResourceSyncLinuxType_t
170
+ * ::nvtxResourceSyncPosixThreadType_t
171
+ */
172
+ typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
173
+
174
+ /** \brief User Defined Synchronization Object Handle .
175
+ * \anchor SYNCUSER_HANDLE_STRUCTURE
176
+ *
177
+ * This structure is opaque to the user and is used as a handle to reference
178
+ * a user defined syncrhonization object. The tools will return a pointer through the API for the application
179
+ * to hold on it's behalf to reference the string in the future.
180
+ *
181
+ */
182
+ typedef struct nvtxSyncUser* nvtxSyncUser_t;
183
+
184
+ /** \brief User Defined Synchronization Object Attributes Structure.
185
+ * \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
186
+ *
187
+ * This structure is used to describe the attributes of a user defined synchronization
188
+ * object. The layout of the structure is defined by a specific version of the tools
189
+ * extension library and can change between different versions of the Tools Extension
190
+ * library.
191
+ *
192
+ * \par Initializing the Attributes
193
+ *
194
+ * The caller should always perform the following three tasks when using
195
+ * attributes:
196
+ * <ul>
197
+ * <li>Zero the structure
198
+ * <li>Set the version field
199
+ * <li>Set the size field
200
+ * </ul>
201
+ *
202
+ * Zeroing the structure sets all the event attributes types and values
203
+ * to the default value.
204
+ *
205
+ * The version and size field are used by the Tools Extension
206
+ * implementation to handle multiple versions of the attributes structure.
207
+ *
208
+ * It is recommended that the caller use one of the following to methods
209
+ * to initialize the event attributes structure:
210
+ *
211
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
212
+ * \code
213
+ * nvtxSyncUserAttributes_t attribs = {0};
214
+ * attribs.version = NVTX_VERSION;
215
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
216
+ * \endcode
217
+ *
218
+ * \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
219
+ * \code
220
+ * nvtxSyncUserAttributes_t attribs = {0};
221
+ * attribs.version = 1;
222
+ * attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
223
+ * \endcode
224
+ *
225
+ * If the caller uses Method 1 it is critical that the entire binary
226
+ * layout of the structure be configured to 0 so that all fields
227
+ * are initialized to the default value.
228
+ *
229
+ * The caller should either use both NVTX_VERSION and
230
+ * NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
231
+ * and a versioned type (Method 2). Using a mix of the two methods
232
+ * will likely cause either source level incompatibility or binary
233
+ * incompatibility in the future.
234
+ *
235
+ * \par Settings Attribute Types and Values
236
+ *
237
+ *
238
+ * \par Example:
239
+ * \code
240
+ * // Initialize
241
+ * nvtxSyncUserAttributes_t attribs = {0};
242
+ * attribs.version = NVTX_VERSION;
243
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
244
+ *
245
+ * // Configure the Attributes
246
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
247
+ * attribs.message.ascii = "Example";
248
+ * \endcode
249
+ *
250
+ * \sa
251
+ * ::nvtxDomainSyncUserCreate
252
+ */
253
+ typedef struct nvtxSyncUserAttributes_v0
254
+ {
255
+ /**
256
+ * \brief Version flag of the structure.
257
+ *
258
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
259
+ * supported in this header file. This can optionally be overridden to
260
+ * another version of the tools extension library.
261
+ */
262
+ uint16_t version;
263
+
264
+ /**
265
+ * \brief Size of the structure.
266
+ *
267
+ * Needs to be set to the size in bytes of the event attribute
268
+ * structure used to specify the event.
269
+ */
270
+ uint16_t size;
271
+
272
+ /** \brief Message type specified in this attribute structure.
273
+ *
274
+ * Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
275
+ * "message" field.
276
+ *
277
+ * Default Value is NVTX_MESSAGE_UNKNOWN
278
+ */
279
+ int32_t messageType; /* nvtxMessageType_t */
280
+
281
+ /** \brief Message assigned to this attribute structure.
282
+ *
283
+ * The text message that is attached to an event.
284
+ */
285
+ nvtxMessageValue_t message;
286
+
287
+ } nvtxSyncUserAttributes_v0;
288
+
289
+ typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
290
+
291
+ /* ------------------------------------------------------------------------- */
292
+ /** \brief Create a user defined synchronization object
293
+ * This is used to track non-OS synchronization working with spinlocks and atomics
294
+ *
295
+ * \param domain - Domain to own the resource
296
+ * \param attribs - A structure to assign multiple attributes to the object.
297
+ *
298
+ * \return A handle that represents the newly created user defined synchronization object.
299
+ *
300
+ * \sa
301
+ * ::nvtxDomainSyncUserCreate
302
+ * ::nvtxDomainSyncUserDestroy
303
+ * ::nvtxDomainSyncUserAcquireStart
304
+ * ::nvtxDomainSyncUserAcquireFailed
305
+ * ::nvtxDomainSyncUserAcquireSuccess
306
+ * ::nvtxDomainSyncUserReleasing
307
+ *
308
+ * \version \NVTX_VERSION_2
309
+ */
310
+ NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
311
+
312
+ /* ------------------------------------------------------------------------- */
313
+ /** \brief Destroy a user defined synchronization object
314
+ * This is used to track non-OS synchronization working with spinlocks and atomics
315
+ *
316
+ * \param handle - A handle to the object to operate on.
317
+ *
318
+ * \sa
319
+ * ::nvtxDomainSyncUserCreate
320
+ * ::nvtxDomainSyncUserDestroy
321
+ * ::nvtxDomainSyncUserAcquireStart
322
+ * ::nvtxDomainSyncUserAcquireFailed
323
+ * ::nvtxDomainSyncUserAcquireSuccess
324
+ * ::nvtxDomainSyncUserReleasing
325
+ *
326
+ * \version \NVTX_VERSION_2
327
+ */
328
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
329
+
330
+ /* ------------------------------------------------------------------------- */
331
+ /** \brief Signal to tools that an attempt to acquire a user defined synchronization object
332
+ *
333
+ * \param handle - A handle to the object to operate on.
334
+ *
335
+ * \sa
336
+ * ::nvtxDomainSyncUserCreate
337
+ * ::nvtxDomainSyncUserDestroy
338
+ * ::nvtxDomainSyncUserAcquireStart
339
+ * ::nvtxDomainSyncUserAcquireFailed
340
+ * ::nvtxDomainSyncUserAcquireSuccess
341
+ * ::nvtxDomainSyncUserReleasing
342
+ *
343
+ * \version \NVTX_VERSION_2
344
+ */
345
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
346
+
347
+ /* ------------------------------------------------------------------------- */
348
+ /** \brief Signal to tools of failure in acquiring a user defined synchronization object
349
+ * This should be called after \ref nvtxDomainSyncUserAcquireStart
350
+ *
351
+ * \param handle - A handle to the object to operate on.
352
+ *
353
+ * \sa
354
+ * ::nvtxDomainSyncUserCreate
355
+ * ::nvtxDomainSyncUserDestroy
356
+ * ::nvtxDomainSyncUserAcquireStart
357
+ * ::nvtxDomainSyncUserAcquireFailed
358
+ * ::nvtxDomainSyncUserAcquireSuccess
359
+ * ::nvtxDomainSyncUserReleasing
360
+ *
361
+ * \version \NVTX_VERSION_2
362
+ */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle);
363
+
364
+ /* ------------------------------------------------------------------------- */
365
+ /** \brief Signal to tools of success in acquiring a user defined synchronization object
366
+ * This should be called after \ref nvtxDomainSyncUserAcquireStart.
367
+ *
368
+ * \param handle - A handle to the object to operate on.
369
+ *
370
+ * \sa
371
+ * ::nvtxDomainSyncUserCreate
372
+ * ::nvtxDomainSyncUserDestroy
373
+ * ::nvtxDomainSyncUserAcquireStart
374
+ * ::nvtxDomainSyncUserAcquireFailed
375
+ * ::nvtxDomainSyncUserAcquireSuccess
376
+ * ::nvtxDomainSyncUserReleasing
377
+ *
378
+ * \version \NVTX_VERSION_2
379
+ */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle);
380
+
381
+ /* ------------------------------------------------------------------------- */
382
+ /** \brief Signal to tools of releasing a reservation on user defined synchronization object
383
+ * This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
384
+ *
385
+ * \param handle - A handle to the object to operate on.
386
+ *
387
+ * \sa
388
+ * ::nvtxDomainSyncUserCreate
389
+ * ::nvtxDomainSyncUserDestroy
390
+ * ::nvtxDomainSyncUserAcquireStart
391
+ * ::nvtxDomainSyncUserAcquireFailed
392
+ * ::nvtxDomainSyncUserAcquireSuccess
393
+ * ::nvtxDomainSyncUserReleasing
394
+ *
395
+ * \version \NVTX_VERSION_2
396
+ */
397
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
398
+
399
+
400
+ /** @} */ /*END defgroup*/
401
+
402
+ #ifdef __cplusplus
403
+ }
404
+ #endif /* __cplusplus */
405
+
406
+ #endif /* NVTOOLSEXT_SYNC_H_ */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExt.h ADDED
@@ -0,0 +1,1499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ /** \file nvToolsExt.h
39
+ */
40
+
41
+ /* ========================================================================= */
42
+ /** \mainpage
43
+ * \tableofcontents
44
+ * \section INTRODUCTION Introduction
45
+ *
46
+ * The NVIDIA Tools Extension library is a set of functions that a
47
+ * developer can use to provide additional information to tools.
48
+ * The additional information is used by the tool to improve
49
+ * analysis and visualization of data.
50
+ *
51
+ * The library introduces close to zero overhead if no tool is
52
+ * attached to the application. The overhead when a tool is
53
+ * attached is specific to the tool.
54
+ *
55
+ * \section INITIALIZATION_SECTION Initialization
56
+ *
57
+ * Typically the tool's library that plugs into NVTX is indirectly
58
+ * loaded via environmental properties that are platform specific.
59
+ * For some platform or special cases, the user may be required
60
+ * to explicitly initialize it instead. This can also
61
+ * be helpful to control when the API loads a tool's library instead
62
+ * of what would typically be the first function call to emit info.
63
+ * For these rare case, see \ref INITIALIZATION for additional information.
64
+ *
65
+ * \section MARKERS_AND_RANGES Markers and Ranges
66
+ *
67
+ * Markers and ranges are used to describe events at a specific time (markers)
68
+ * or over a time span (ranges) during the execution of the application
69
+ * respectively.
70
+ *
71
+ * \subsection MARKERS Markers
72
+ *
73
+ * Markers denote specific moments in time.
74
+ *
75
+ *
76
+ * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
77
+ * how to specify the domain.
78
+ *
79
+ * \subsection THREAD_RANGES Thread Ranges
80
+ *
81
+ * Thread ranges denote nested time ranges. Nesting is maintained per thread
82
+ * per domain and does not require any additional correlation mechanism. The
83
+ * duration of a thread range is defined by the corresponding pair of
84
+ * nvtxRangePush* to nvtxRangePop API calls.
85
+ *
86
+ * See \ref DOMAINS and \ref EVENT_ATTRIBUTES for additional information on
87
+ * how to specify the domain.
88
+ *
89
+ * \subsection PROCESS_RANGES Process Ranges
90
+ *
91
+ * Process ranges denote a time span that can expose arbitrary concurrency, as
92
+ * opposed to thread ranges that only support nesting. In addition the range
93
+ * start event can happen on a different thread than the end marker. For the
94
+ * correlation of a start/end pair an unique correlation ID is used that is
95
+ * returned from the start API call and needs to be passed into the end API
96
+ * call.
97
+ *
98
+ * \subsection EVENT_ATTRIBUTES Event Attributes
99
+ *
100
+ * \ref MARKERS_AND_RANGES can be annotated with various attributes to provide
101
+ * additional information for an event or to guide the tool's visualization of
102
+ * the data. Each of the attributes is optional and if left unused the
103
+ * attributes fall back to a default value. The attributes include:
104
+ * - color
105
+ * - category
106
+ *
107
+ * To specify any attribute other than the text message, the \ref
108
+ * EVENT_ATTRIBUTE_STRUCTURE "Event Attribute Structure" must be used.
109
+ *
110
+ * \section DOMAINS Domains
111
+ *
112
+ * Domains enable developers to scope annotations. By default all events and
113
+ * annotations are in the default domain. Additional domains can be registered.
114
+ * This allows developers to scope markers, ranges, and resources names to
115
+ * avoid conflicts.
116
+ *
117
+ * The function ::nvtxDomainCreateA or ::nvtxDomainCreateW is used to create
118
+ * a named domain.
119
+ *
120
+ * Each domain maintains its own
121
+ * - categories
122
+ * - thread range stacks
123
+ * - registered strings
124
+ *
125
+ * The function ::nvtxDomainDestroy marks the end of the domain. Destroying
126
+ * a domain unregisters and destroys all objects associated with it such as
127
+ * registered strings, resource objects, named categories, and started ranges.
128
+ *
129
+ * \section RESOURCE_NAMING Resource Naming
130
+ *
131
+ * This section covers calls that allow to annotate objects with user-provided
132
+ * names in order to allow for a better analysis of complex trace data. All of
133
+ * the functions take the handle or the ID of the object to name and the name.
134
+ * The functions can be called multiple times during the execution of an
135
+ * application, however, in that case it is implementation dependent which
136
+ * name will be reported by the tool.
137
+ *
138
+ * \subsection CATEGORY_NAMING Category Naming
139
+ *
140
+ * Some function in this library support associating an integer category
141
+ * to enable filtering and sorting. The category naming functions allow
142
+ * the application to associate a user friendly name with the integer
143
+ * category. Support for domains have been added in NVTX_VERSION_2 to
144
+ * avoid collisions when domains are developed independently.
145
+ *
146
+ * \subsection RESOURCE_OBJECTS Resource Objects
147
+ *
148
+ * Resource objects are a generic mechanism for attaching data to an application
149
+ * resource. The identifier field makes the association to a pointer or handle,
150
+ * while the type field helps provide deeper understanding of the identifier as
151
+ * well as enabling differentiation in cases where handles generated by different
152
+ * APIs may collide. The resource object may also have an associated message to
153
+ * associate with the application resource, enabling further annotation of this
154
+ * object and how it is used.
155
+ *
156
+ * The resource object was introduced in NVTX_VERSION_2 to supersede existing naming
157
+ * functions and allow the application resource identified by those functions to be
158
+ * associated to a domain. The other naming functions are still supported for backward
159
+ * compatibility but will be associated only to the default domain.
160
+ *
161
+ * \subsection RESOURCE_NAMING_OS Resource Naming
162
+ *
163
+ * Some operating system resources creation APIs do not support providing a user friendly
164
+ * name, such as some OS thread creation APIs. This API supports resource naming
165
+ * both through resource objects and functions following the pattern
166
+ * nvtxName[RESOURCE_TYPE][A|W](identifier, name). Resource objects introduced in NVTX_VERSION 2
167
+ * supersede the other functions with a more general method of assigning names to OS resources,
168
+ * along with associating them to domains too. The older nvtxName* functions are only associated
169
+ * with the default domain.
170
+ * \section EXTENSIONS Optional Extensions
171
+ * Optional extensions will either appear within the existing sections they extend or appear
172
+ * in the "Related Pages" when they introduce new concepts.
173
+ */
174
+
175
+ /**
176
+ * Tools Extension API version
177
+ */
178
+ #if defined(NVTX_VERSION) && NVTX_VERSION < 3
179
+ #error "Trying to #include NVTX version 3 in a source file where an older NVTX version has already been included. If you are not directly using NVTX (the NVIDIA Tools Extension library), you are getting this error because libraries you are using have included different versions of NVTX. Suggested solutions are: (1) reorder #includes so the newest NVTX version is included first, (2) avoid using the conflicting libraries in the same .c/.cpp file, or (3) update the library using the older NVTX version to use the newer version instead."
180
+ #endif
181
+
182
+ /* Header guard */
183
+ #if !defined(NVTX_VERSION)
184
+ #define NVTX_VERSION 3
185
+
186
+ #if defined(_MSC_VER)
187
+ #define NVTX_API __stdcall
188
+ #define NVTX_INLINE_STATIC __inline static
189
+ #else /*defined(__GNUC__)*/
190
+ #define NVTX_API
191
+ #define NVTX_INLINE_STATIC inline static
192
+ #endif /* Platform */
193
+
194
+ #if defined(NVTX_NO_IMPL)
195
+ /* When omitting implementation, avoid declaring functions inline */
196
+ /* without definitions, since this causes compiler warnings. */
197
+ #define NVTX_DECLSPEC
198
+ #elif defined(NVTX_EXPORT_API)
199
+ /* Allow overriding definition of NVTX_DECLSPEC when exporting API. */
200
+ /* Default is empty, meaning non-inline with external linkage. */
201
+ #if !defined(NVTX_DECLSPEC)
202
+ #define NVTX_DECLSPEC
203
+ #endif
204
+ #else
205
+ /* Normal NVTX usage defines the NVTX API inline with static */
206
+ /* (internal) linkage. */
207
+ #define NVTX_DECLSPEC NVTX_INLINE_STATIC
208
+ #endif
209
+
210
+ #include "nvtxDetail/nvtxLinkOnce.h"
211
+
212
+ #define NVTX_VERSIONED_IDENTIFIER_L3(NAME, VERSION) NAME##_v##VERSION
213
+ #define NVTX_VERSIONED_IDENTIFIER_L2(NAME, VERSION) NVTX_VERSIONED_IDENTIFIER_L3(NAME, VERSION)
214
+ #define NVTX_VERSIONED_IDENTIFIER(NAME) NVTX_VERSIONED_IDENTIFIER_L2(NAME, NVTX_VERSION)
215
+
216
+ /**
217
+ * The nvToolsExt library depends on stdint.h. If the build tool chain in use
218
+ * does not include stdint.h then define NVTX_STDINT_TYPES_ALREADY_DEFINED
219
+ * and define the following types:
220
+ * <ul>
221
+ * <li>uint8_t
222
+ * <li>int8_t
223
+ * <li>uint16_t
224
+ * <li>int16_t
225
+ * <li>uint32_t
226
+ * <li>int32_t
227
+ * <li>uint64_t
228
+ * <li>int64_t
229
+ * <li>uintptr_t
230
+ * <li>intptr_t
231
+ * </ul>
232
+ * #define NVTX_STDINT_TYPES_ALREADY_DEFINED if you are using your own header file.
233
+ */
234
+ #ifndef NVTX_STDINT_TYPES_ALREADY_DEFINED
235
+ #include <stdint.h>
236
+ #endif
237
+
238
+ #include <stddef.h>
239
+
240
+ #ifdef __cplusplus
241
+ extern "C" {
242
+ #endif /* __cplusplus */
243
+
244
/**
 * Result Codes
 *
 * Returned by the NVTX loading/initialization machinery; the INIT/LOAD
 * names describe failures while locating and binding a tool's injection
 * library.
 */

#define NVTX_SUCCESS 0
#define NVTX_FAIL 1
#define NVTX_ERR_INIT_LOAD_PROPERTY 2
#define NVTX_ERR_INIT_ACCESS_LIBRARY 3
#define NVTX_ERR_INIT_LOAD_LIBRARY 4
#define NVTX_ERR_INIT_MISSING_LIBRARY_ENTRY_POINT 5
#define NVTX_ERR_INIT_FAILED_LIBRARY_ENTRY_POINT 6
#define NVTX_ERR_NO_INJECTION_LIBRARY_AVAILABLE 7

/**
 * Size of the nvtxEventAttributes_t structure.
 */
#define NVTX_EVENT_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxEventAttributes_t) ) )

/* Sentinel depth value (-2); per its name, presumably reported when a tool
 * does not track range push/pop nesting — confirm against the tool's docs. */
#define NVTX_NO_PUSH_POP_TRACKING ((int)-2)
263
+
264
/* Unique ID correlating a Start/End pair of process-range events. */
typedef uint64_t nvtxRangeId_t;

/* Forward declaration of opaque domain registration structure */
struct nvtxDomainRegistration_st;
typedef struct nvtxDomainRegistration_st nvtxDomainRegistration;

/* \brief Domain Handle Structure.
 * \anchor DOMAIN_HANDLE_STRUCTURE
 *
 * Opaque to the user; used as a handle to reference a domain. Returned by
 * the tool from the NVTX domain-creation API calls.
 */
typedef nvtxDomainRegistration* nvtxDomainHandle_t;

/* Forward declaration of opaque string registration structure */
struct nvtxStringRegistration_st;
typedef struct nvtxStringRegistration_st nvtxStringRegistration;

/* \brief Registered String Handle Structure.
 * \anchor REGISTERED_STRING_HANDLE_STRUCTURE
 *
 * Opaque to the user; used as a handle to reference a registered string.
 * Returned by the tool from the NVTX string-registration API calls.
 */
typedef nvtxStringRegistration* nvtxStringHandle_t;
293
+
294
/* ========================================================================= */
/** \defgroup GENERAL General
 * @{
 */

/** ---------------------------------------------------------------------------
 * Color Types
 * ------------------------------------------------------------------------- */
typedef enum nvtxColorType_t
{
    NVTX_COLOR_UNKNOWN = 0, /**< Color attribute is unused. */
    NVTX_COLOR_ARGB    = 1  /**< An ARGB color is provided. */
} nvtxColorType_t;
307
+
308
/** ---------------------------------------------------------------------------
 * Message Types
 * ------------------------------------------------------------------------- */
typedef enum nvtxMessageType_t
{
    NVTX_MESSAGE_UNKNOWN         = 0, /**< Message payload is unused. */
    NVTX_MESSAGE_TYPE_ASCII      = 1, /**< A character sequence is used as payload. */
    NVTX_MESSAGE_TYPE_UNICODE    = 2, /**< A wide character sequence is used as payload. */
    /* NVTX_VERSION_2 */
    NVTX_MESSAGE_TYPE_REGISTERED = 3, /**< A unique string handle that was registered
                                           with \ref nvtxDomainRegisterStringA() or
                                           \ref nvtxDomainRegisterStringW(). */
} nvtxMessageType_t;
321
+
322
+ typedef union nvtxMessageValue_t
323
+ {
324
+ const char* ascii;
325
+ const wchar_t* unicode;
326
+ /* NVTX_VERSION_2 */
327
+ nvtxStringHandle_t registered;
328
+ } nvtxMessageValue_t;
329
+
330
+
331
+ /** @} */ /*END defgroup*/
332
+ /* ------------------------------------------------------------------------- */
333
+ /** \brief Force initialization (optional)
334
+ *
335
+ * Force NVTX library to initialize. The first call to any NVTX API function
336
+ * will automatically initialize the entire API. This can make the first call
337
+ * much slower than subsequent calls. In applications where the first call to
338
+ * NVTX may be in a performance-critical section, calling nvtxInitialize before
339
+ * any performance-critical sections will ensure NVTX initialization occurs at
340
+ * an acceptable time. Since nvtxInitialize takes no parameters and has no
341
+ * expected behavior besides initialization, it is convenient to add a call to
342
+ * nvtxInitialize in NVTX-instrumented applications that need to force earlier
343
+ * initialization without changing any other code. For example, if an app's
344
+ * first NVTX call is nvtxDomainCreate, and it is difficult to move that call
345
+ * earlier because the domain handle must be stored in an object only created
346
+ * at that point, adding a call to nvtxInitialize at the top of main() will
347
+ * ensure the later call to nvtxDomainCreate is as fast as possible.
348
+ *
349
+ * \version \NVTX_VERSION_3
350
+ *
351
+ * \param reserved - must be zero or NULL.
352
+ *
353
+ * @{ */
354
+ NVTX_DECLSPEC void NVTX_API nvtxInitialize(const void* reserved);
355
+ /** @} */
356
+
357
+
358
+ /** @} */ /*END defgroup*/
359
+
360
/* ========================================================================= */
/** \defgroup EVENT_ATTRIBUTES Event Attributes
 * @{
 */

/** ---------------------------------------------------------------------------
 * Payload Types
 *
 * Selects which member of the payload union in nvtxEventAttributes_t holds
 * the event's numeric payload.
 * ------------------------------------------------------------------------- */
typedef enum nvtxPayloadType_t
{
    NVTX_PAYLOAD_UNKNOWN             = 0, /**< Payload attribute is unused. */
    NVTX_PAYLOAD_TYPE_UNSIGNED_INT64 = 1, /**< A 64 bit unsigned integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_INT64          = 2, /**< A 64 bit signed integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_DOUBLE         = 3, /**< A 64 bit floating point value is used as payload. */
    /* NVTX_VERSION_2 */
    NVTX_PAYLOAD_TYPE_UNSIGNED_INT32 = 4, /**< A 32 bit unsigned integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_INT32          = 5, /**< A 32 bit signed integer value is used as payload. */
    NVTX_PAYLOAD_TYPE_FLOAT          = 6  /**< A 32 bit floating point value is used as payload. */
} nvtxPayloadType_t;
379
+
380
+ /** \brief Event Attribute Structure.
381
+ * \anchor EVENT_ATTRIBUTE_STRUCTURE
382
+ *
383
+ * This structure is used to describe the attributes of an event. The layout of
384
+ * the structure is defined by a specific version of the tools extension
385
+ * library and can change between different versions of the Tools Extension
386
+ * library.
387
+ *
388
+ * \par Initializing the Attributes
389
+ *
390
+ * The caller should always perform the following three tasks when using
391
+ * attributes:
392
+ * <ul>
393
+ * <li>Zero the structure
394
+ * <li>Set the version field
395
+ * <li>Set the size field
396
+ * </ul>
397
+ *
398
+ * Zeroing the structure sets all the event attributes types and values
399
+ * to the default value.
400
+ *
401
+ * The version and size field are used by the Tools Extension
402
+ * implementation to handle multiple versions of the attributes structure.
403
+ *
404
+ * It is recommended that the caller use one of the following to methods
405
+ * to initialize the event attributes structure:
406
+ *
407
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
408
+ * \code
409
+ * nvtxEventAttributes_t eventAttrib = {0};
410
+ * eventAttrib.version = NVTX_VERSION;
411
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
412
+ * \endcode
413
+ *
414
+ * \par Method 2: Initializing nvtxEventAttributes for a specific version
415
+ * \code
416
+ * nvtxEventAttributes_t eventAttrib = {0};
417
+ * eventAttrib.version = 1;
418
+ * eventAttrib.size = (uint16_t)(sizeof(nvtxEventAttributes_v1));
419
+ * \endcode
420
+ *
421
+ * If the caller uses Method 1 it is critical that the entire binary
422
+ * layout of the structure be configured to 0 so that all fields
423
+ * are initialized to the default value.
424
+ *
425
+ * The caller should either use both NVTX_VERSION and
426
+ * NVTX_EVENT_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
427
+ * and a versioned type (Method 2). Using a mix of the two methods
428
+ * will likely cause either source level incompatibility or binary
429
+ * incompatibility in the future.
430
+ *
431
+ * \par Settings Attribute Types and Values
432
+ *
433
+ *
434
+ * \par Example:
435
+ * \code
436
+ * // Initialize
437
+ * nvtxEventAttributes_t eventAttrib = {0};
438
+ * eventAttrib.version = NVTX_VERSION;
439
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
440
+ *
441
+ * // Configure the Attributes
442
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
443
+ * eventAttrib.color = 0xFF880000;
444
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
445
+ * eventAttrib.message.ascii = "Example";
446
+ * \endcode
447
+ *
448
+ * In the example the caller does not have to set the value of
449
+ * \ref ::nvtxEventAttributes_v2::category or
450
+ * \ref ::nvtxEventAttributes_v2::payload as these fields were set to
451
+ * the default value by {0}.
452
+ * \sa
453
+ * ::nvtxDomainMarkEx
454
+ * ::nvtxDomainRangeStartEx
455
+ * ::nvtxDomainRangePushEx
456
+ */
457
+ typedef struct nvtxEventAttributes_v2
458
+ {
459
+ /**
460
+ * \brief Version flag of the structure.
461
+ *
462
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
463
+ * supported in this header file. This can optionally be overridden to
464
+ * another version of the tools extension library.
465
+ */
466
+ uint16_t version;
467
+
468
+ /**
469
+ * \brief Size of the structure.
470
+ *
471
+ * Needs to be set to the size in bytes of the event attribute
472
+ * structure used to specify the event.
473
+ */
474
+ uint16_t size;
475
+
476
+ /**
477
+ * \brief ID of the category the event is assigned to.
478
+ *
479
+ * A category is a user-controlled ID that can be used to group
480
+ * events. The tool may use category IDs to improve filtering or
481
+ * enable grouping of events in the same category. The functions
482
+ * \ref ::nvtxNameCategoryA or \ref ::nvtxNameCategoryW can be used
483
+ * to name a category.
484
+ *
485
+ * Default Value is 0
486
+ */
487
+ uint32_t category;
488
+
489
+ /** \brief Color type specified in this attribute structure.
490
+ *
491
+ * Defines the color format of the attribute structure's \ref COLOR_FIELD
492
+ * "color" field.
493
+ *
494
+ * Default Value is NVTX_COLOR_UNKNOWN
495
+ */
496
+ int32_t colorType; /* nvtxColorType_t */
497
+
498
+ /** \brief Color assigned to this event. \anchor COLOR_FIELD
499
+ *
500
+ * The color that the tool should use to visualize the event.
501
+ */
502
+ uint32_t color;
503
+
504
+ /**
505
+ * \brief Payload type specified in this attribute structure.
506
+ *
507
+ * Defines the payload format of the attribute structure's \ref PAYLOAD_FIELD
508
+ * "payload" field.
509
+ *
510
+ * Default Value is NVTX_PAYLOAD_UNKNOWN
511
+ */
512
+ int32_t payloadType; /* nvtxPayloadType_t */
513
+
514
+ int32_t reserved0;
515
+
516
+ /**
517
+ * \brief Payload assigned to this event. \anchor PAYLOAD_FIELD
518
+ *
519
+ * A numerical value that can be used to annotate an event. The tool could
520
+ * use the payload data to reconstruct graphs and diagrams.
521
+ */
522
+ union payload_t
523
+ {
524
+ uint64_t ullValue;
525
+ int64_t llValue;
526
+ double dValue;
527
+ /* NVTX_VERSION_2 */
528
+ uint32_t uiValue;
529
+ int32_t iValue;
530
+ float fValue;
531
+ } payload;
532
+
533
+ /** \brief Message type specified in this attribute structure.
534
+ *
535
+ * Defines the message format of the attribute structure's \ref MESSAGE_FIELD
536
+ * "message" field.
537
+ *
538
+ * Default Value is NVTX_MESSAGE_UNKNOWN
539
+ */
540
+ int32_t messageType; /* nvtxMessageType_t */
541
+
542
+ /** \brief Message assigned to this attribute structure. \anchor MESSAGE_FIELD
543
+ *
544
+ * The text message that is attached to an event.
545
+ */
546
+ nvtxMessageValue_t message;
547
+
548
+ } nvtxEventAttributes_v2;
549
+
550
+ typedef struct nvtxEventAttributes_v2 nvtxEventAttributes_t;
551
+
552
+ /** @} */ /*END defgroup*/
553
+ /* ========================================================================= */
554
+ /** \defgroup MARKERS_AND_RANGES Markers and Ranges
555
+ *
556
+ * See \ref MARKERS_AND_RANGES for more details
557
+ *
558
+ * @{
559
+ */
560
+
561
+ /** \name Marker */
562
+
563
+ /* ------------------------------------------------------------------------- */
564
+ /** \brief Marks an instantaneous event in the application.
565
+ *
566
+ * A marker can contain a text message or specify additional information
567
+ * using the event attributes structure. These attributes include a text
568
+ * message, color, category, and a payload. Each of the attributes is optional
569
+ * and can only be sent out using the \ref nvtxDomainMarkEx function.
570
+ *
571
+ * nvtxDomainMarkEx(NULL, event) is equivalent to calling
572
+ * nvtxMarkEx(event).
573
+ *
574
+ * \param domain - The domain of scoping the category.
575
+ * \param eventAttrib - The event attribute structure defining the marker's
576
+ * attribute types and attribute values.
577
+ *
578
+ * \sa
579
+ * ::nvtxMarkEx
580
+ *
581
+ * \version \NVTX_VERSION_2
582
+ * @{ */
583
+ NVTX_DECLSPEC void NVTX_API nvtxDomainMarkEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
584
+ /** @} */
585
+
586
+ /* ------------------------------------------------------------------------- */
587
+ /** \brief Marks an instantaneous event in the application.
588
+ *
589
+ * A marker can contain a text message or specify additional information
590
+ * using the event attributes structure. These attributes include a text
591
+ * message, color, category, and a payload. Each of the attributes is optional
592
+ * and can only be sent out using the \ref nvtxMarkEx function.
593
+ * If \ref nvtxMarkA or \ref nvtxMarkW are used to specify the marker
594
+ * or if an attribute is unspecified then a default value will be used.
595
+ *
596
+ * \param eventAttrib - The event attribute structure defining the marker's
597
+ * attribute types and attribute values.
598
+ *
599
+ * \par Example:
600
+ * \code
601
+ * // zero the structure
602
+ * nvtxEventAttributes_t eventAttrib = {0};
603
+ * // set the version and the size information
604
+ * eventAttrib.version = NVTX_VERSION;
605
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
606
+ * // configure the attributes. 0 is the default for all attributes.
607
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
608
+ * eventAttrib.color = 0xFF880000;
609
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
610
+ * eventAttrib.message.ascii = "Example nvtxMarkEx";
611
+ * nvtxMarkEx(&eventAttrib);
612
+ * \endcode
613
+ *
614
+ * \sa
615
+ * ::nvtxDomainMarkEx
616
+ *
617
+ * \version \NVTX_VERSION_1
618
+ * @{ */
619
+ NVTX_DECLSPEC void NVTX_API nvtxMarkEx(const nvtxEventAttributes_t* eventAttrib);
620
+ /** @} */
621
+
622
+ /* ------------------------------------------------------------------------- */
623
+ /** \brief Marks an instantaneous event in the application.
624
+ *
625
+ * A marker created using \ref nvtxMarkA or \ref nvtxMarkW contains only a
626
+ * text message.
627
+ *
628
+ * \param message - The message associated to this marker event.
629
+ *
630
+ * \par Example:
631
+ * \code
632
+ * nvtxMarkA("Example nvtxMarkA");
633
+ * nvtxMarkW(L"Example nvtxMarkW");
634
+ * \endcode
635
+ *
636
+ * \sa
637
+ * ::nvtxDomainMarkEx
638
+ * ::nvtxMarkEx
639
+ *
640
+ * \version \NVTX_VERSION_0
641
+ * @{ */
642
+ NVTX_DECLSPEC void NVTX_API nvtxMarkA(const char* message);
643
+ NVTX_DECLSPEC void NVTX_API nvtxMarkW(const wchar_t* message);
644
+ /** @} */
645
+
646
+
647
+ /** \name Process Ranges */
648
+
649
+ /* ------------------------------------------------------------------------- */
650
+ /** \brief Starts a process range in a domain.
651
+ *
652
+ * \param domain - The domain of scoping the category.
653
+ * \param eventAttrib - The event attribute structure defining the range's
654
+ * attribute types and attribute values.
655
+ *
656
+ * \return The unique ID used to correlate a pair of Start and End events.
657
+ *
658
+ * \remarks Ranges defined by Start/End can overlap.
659
+ *
660
+ * \par Example:
661
+ * \code
662
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain");
663
+ * nvtxEventAttributes_t eventAttrib = {0};
664
+ * eventAttrib.version = NVTX_VERSION;
665
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
666
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
667
+ * eventAttrib.message.ascii = "my range";
668
+ * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib);
669
+ * // ...
670
+ * nvtxDomainRangeEnd(rangeId);
671
+ * \endcode
672
+ *
673
+ * \sa
674
+ * ::nvtxDomainRangeEnd
675
+ *
676
+ * \version \NVTX_VERSION_2
677
+ * @{ */
678
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxDomainRangeStartEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
679
+ /** @} */
680
+
681
+ /* ------------------------------------------------------------------------- */
682
+ /** \brief Starts a process range.
683
+ *
684
+ * \param eventAttrib - The event attribute structure defining the range's
685
+ * attribute types and attribute values.
686
+ *
687
+ * \return The unique ID used to correlate a pair of Start and End events.
688
+ *
689
+ * \remarks Ranges defined by Start/End can overlap.
690
+ *
691
+ * \par Example:
692
+ * \code
693
+ * nvtxEventAttributes_t eventAttrib = {0};
694
+ * eventAttrib.version = NVTX_VERSION;
695
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
696
+ * eventAttrib.category = 3;
697
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
698
+ * eventAttrib.color = 0xFF0088FF;
699
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
700
+ * eventAttrib.message.ascii = "Example Range";
701
+ * nvtxRangeId_t rangeId = nvtxRangeStartEx(&eventAttrib);
702
+ * // ...
703
+ * nvtxRangeEnd(rangeId);
704
+ * \endcode
705
+ *
706
+ * \sa
707
+ * ::nvtxRangeEnd
708
+ * ::nvtxDomainRangeStartEx
709
+ *
710
+ * \version \NVTX_VERSION_1
711
+ * @{ */
712
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartEx(const nvtxEventAttributes_t* eventAttrib);
713
+ /** @} */
714
+
715
+ /* ------------------------------------------------------------------------- */
716
+ /** \brief Starts a process range.
717
+ *
718
+ * \param message - The event message associated to this range event.
719
+ *
720
+ * \return The unique ID used to correlate a pair of Start and End events.
721
+ *
722
+ * \remarks Ranges defined by Start/End can overlap.
723
+ *
724
+ * \par Example:
725
+ * \code
726
+ * nvtxRangeId_t r1 = nvtxRangeStartA("Range 1");
727
+ * nvtxRangeId_t r2 = nvtxRangeStartW(L"Range 2");
728
+ * nvtxRangeEnd(r1);
729
+ * nvtxRangeEnd(r2);
730
+ * \endcode
731
+ *
732
+ * \sa
733
+ * ::nvtxRangeEnd
734
+ * ::nvtxRangeStartEx
735
+ * ::nvtxDomainRangeStartEx
736
+ *
737
+ * \version \NVTX_VERSION_0
738
+ * @{ */
739
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartA(const char* message);
740
+ NVTX_DECLSPEC nvtxRangeId_t NVTX_API nvtxRangeStartW(const wchar_t* message);
741
+ /** @} */
742
+
743
+ /* ------------------------------------------------------------------------- */
744
+ /** \brief Ends a process range.
745
+ *
746
+ * \param domain - The domain
747
+ * \param id - The correlation ID returned from a nvtxRangeStart call.
748
+ *
749
+ * \remarks This function is offered completeness but is an alias for ::nvtxRangeEnd.
750
+ * It does not need a domain param since that is associated iwth the range ID at ::nvtxDomainRangeStartEx
751
+ *
752
+ * \par Example:
753
+ * \code
754
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("my domain");
755
+ * nvtxEventAttributes_t eventAttrib = {0};
756
+ * eventAttrib.version = NVTX_VERSION;
757
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
758
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
759
+ * eventAttrib.message.ascii = "my range";
760
+ * nvtxRangeId_t rangeId = nvtxDomainRangeStartEx(&eventAttrib);
761
+ * // ...
762
+ * nvtxDomainRangeEnd(rangeId);
763
+ * \endcode
764
+ *
765
+ * \sa
766
+ * ::nvtxDomainRangeStartEx
767
+ *
768
+ * \version \NVTX_VERSION_2
769
+ * @{ */
770
+ NVTX_DECLSPEC void NVTX_API nvtxDomainRangeEnd(nvtxDomainHandle_t domain, nvtxRangeId_t id);
771
+ /** @} */
772
+
773
+ /* ------------------------------------------------------------------------- */
774
+ /** \brief Ends a process range.
775
+ *
776
+ * \param id - The correlation ID returned from an nvtxRangeStart call.
777
+ *
778
+ * \sa
779
+ * ::nvtxDomainRangeStartEx
780
+ * ::nvtxRangeStartEx
781
+ * ::nvtxRangeStartA
782
+ * ::nvtxRangeStartW
783
+ *
784
+ * \version \NVTX_VERSION_0
785
+ * @{ */
786
+ NVTX_DECLSPEC void NVTX_API nvtxRangeEnd(nvtxRangeId_t id);
787
+ /** @} */
788
+
789
+ /** \name Thread Ranges */
790
+
791
+ /* ------------------------------------------------------------------------- */
792
+ /** \brief Starts a nested thread range.
793
+ *
794
+ * \param domain - The domain of scoping.
795
+ * \param eventAttrib - The event attribute structure defining the range's
796
+ * attribute types and attribute values.
797
+ *
798
+ * \return The 0 based level of range being started. This value is scoped to the domain.
799
+ * If an error occurs, a negative value is returned.
800
+ *
801
+ * \par Example:
802
+ * \code
803
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
804
+ * nvtxEventAttributes_t eventAttrib = {0};
805
+ * eventAttrib.version = NVTX_VERSION;
806
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
807
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
808
+ * eventAttrib.color = 0xFFFF0000;
809
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
810
+ * eventAttrib.message.ascii = "Level 0";
811
+ * nvtxDomainRangePushEx(domain, &eventAttrib);
812
+ *
813
+ * // Re-use eventAttrib
814
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
815
+ * eventAttrib.message.unicode = L"Level 1";
816
+ * nvtxDomainRangePushEx(domain, &eventAttrib);
817
+ *
818
+ * nvtxDomainRangePop(domain); //level 1
819
+ * nvtxDomainRangePop(domain); //level 0
820
+ * \endcode
821
+ *
822
+ * \sa
823
+ * ::nvtxDomainRangePop
824
+ *
825
+ * \version \NVTX_VERSION_2
826
+ * @{ */
827
+ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePushEx(nvtxDomainHandle_t domain, const nvtxEventAttributes_t* eventAttrib);
828
+ /** @} */
829
+
830
+ /* ------------------------------------------------------------------------- */
831
+ /** \brief Starts a nested thread range.
832
+ *
833
+ * \param eventAttrib - The event attribute structure defining the range's
834
+ * attribute types and attribute values.
835
+ *
836
+ * \return The 0 based level of range being started. This level is per domain.
837
+ * If an error occurs a negative value is returned.
838
+ *
839
+ * \par Example:
840
+ * \code
841
+ * nvtxEventAttributes_t eventAttrib = {0};
842
+ * eventAttrib.version = NVTX_VERSION;
843
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
844
+ * eventAttrib.colorType = NVTX_COLOR_ARGB;
845
+ * eventAttrib.color = 0xFFFF0000;
846
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
847
+ * eventAttrib.message.ascii = "Level 0";
848
+ * nvtxRangePushEx(&eventAttrib);
849
+ *
850
+ * // Re-use eventAttrib
851
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_UNICODE;
852
+ * eventAttrib.message.unicode = L"Level 1";
853
+ * nvtxRangePushEx(&eventAttrib);
854
+ *
855
+ * nvtxRangePop();
856
+ * nvtxRangePop();
857
+ * \endcode
858
+ *
859
+ * \sa
860
+ * ::nvtxDomainRangePushEx
861
+ * ::nvtxRangePop
862
+ *
863
+ * \version \NVTX_VERSION_1
864
+ * @{ */
865
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushEx(const nvtxEventAttributes_t* eventAttrib);
866
+ /** @} */
867
+
868
+ /* ------------------------------------------------------------------------- */
869
+ /** \brief Starts a nested thread range.
870
+ *
871
+ * \param message - The event message associated to this range event.
872
+ *
873
+ * \return The 0 based level of range being started. If an error occurs a
874
+ * negative value is returned.
875
+ *
876
+ * \par Example:
877
+ * \code
878
+ * nvtxRangePushA("Level 0");
879
+ * nvtxRangePushW(L"Level 1");
880
+ * nvtxRangePop();
881
+ * nvtxRangePop();
882
+ * \endcode
883
+ *
884
+ * \sa
885
+ * ::nvtxDomainRangePushEx
886
+ * ::nvtxRangePop
887
+ *
888
+ * \version \NVTX_VERSION_0
889
+ * @{ */
890
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushA(const char* message);
891
+ NVTX_DECLSPEC int NVTX_API nvtxRangePushW(const wchar_t* message);
892
+ /** @} */
893
+
894
+
895
+ /* ------------------------------------------------------------------------- */
896
+ /** \brief Ends a nested thread range.
897
+ *
898
+ * \return The level of the range being ended. If an error occurs a negative
899
+ * value is returned on the current thread.
900
+ *
901
+ * \par Example:
902
+ * \code
903
+ * nvtxDomainHandle_t domain = nvtxDomainCreate("example library");
904
+ * nvtxDomainRangePushA(domain, "Level 0");
905
+ * nvtxDomainRangePushW(domain, L"Level 1");
906
+ * nvtxDomainRangePop(domain);
907
+ * nvtxDomainRangePop(domain);
908
+ * \endcode
909
+ *
910
+ * \sa
911
+ * ::nvtxRangePushEx
912
+ * ::nvtxRangePushA
913
+ * ::nvtxRangePushW
914
+ *
915
+ * \version \NVTX_VERSION_2
916
+ * @{ */
917
+ NVTX_DECLSPEC int NVTX_API nvtxDomainRangePop(nvtxDomainHandle_t domain);
918
+ /** @} */
919
+
920
+ /* ------------------------------------------------------------------------- */
921
+ /** \brief Ends a nested thread range.
922
+ *
923
+ * \return The level of the range being ended. If an error occurs a negative
924
+ * value is returned on the current thread.
925
+ *
926
+ * \par Example:
927
+ * \code
928
+ * nvtxRangePushA("Level 0");
929
+ * nvtxRangePushW(L"Level 1");
930
+ * nvtxRangePop();
931
+ * nvtxRangePop();
932
+ * \endcode
933
+ *
934
+ * \sa
935
+ * ::nvtxRangePushEx
936
+ * ::nvtxRangePushA
937
+ * ::nvtxRangePushW
938
+ *
939
+ * \version \NVTX_VERSION_0
940
+ * @{ */
941
+ NVTX_DECLSPEC int NVTX_API nvtxRangePop(void);
942
+ /** @} */
943
+
944
+
945
+ /** @} */ /*END defgroup*/
946
+ /* ========================================================================= */
947
+ /** \defgroup RESOURCE_NAMING Resource Naming
948
+ *
949
+ * See \ref RESOURCE_NAMING for more details
950
+ *
951
+ * @{
952
+ */
953
+
954
+
955
+ /* ------------------------------------------------------------------------- */
956
+ /** \name Functions for Generic Resource Naming*/
957
+ /* ------------------------------------------------------------------------- */
958
+
959
+ /* ------------------------------------------------------------------------- */
960
+ /** \cond SHOW_HIDDEN
961
+ * \brief Resource typing helpers.
962
+ *
963
+ * Classes are used to make it easy to create a series of resource types
964
+ * per API without collisions
965
+ */
966
+ #define NVTX_RESOURCE_MAKE_TYPE(CLASS, INDEX) ((((uint32_t)(NVTX_RESOURCE_CLASS_ ## CLASS))<<16)|((uint32_t)(INDEX)))
967
+ #define NVTX_RESOURCE_CLASS_GENERIC 1
968
+ /** \endcond */
969
+
970
+ /* ------------------------------------------------------------------------- */
971
+ /** \brief Generic resource type for when a resource class is not available.
972
+ *
973
+ * \sa
974
+ * ::nvtxDomainResourceCreate
975
+ *
976
+ * \version \NVTX_VERSION_2
977
+ */
978
+ typedef enum nvtxResourceGenericType_t
979
+ {
980
+ NVTX_RESOURCE_TYPE_UNKNOWN = 0,
981
+ NVTX_RESOURCE_TYPE_GENERIC_POINTER = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 1), /**< Generic pointer assumed to have no collisions with other pointers. */
982
+ NVTX_RESOURCE_TYPE_GENERIC_HANDLE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 2), /**< Generic handle assumed to have no collisions with other handles. */
983
+ NVTX_RESOURCE_TYPE_GENERIC_THREAD_NATIVE = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 3), /**< OS native thread identifier. */
984
+ NVTX_RESOURCE_TYPE_GENERIC_THREAD_POSIX = NVTX_RESOURCE_MAKE_TYPE(GENERIC, 4) /**< POSIX pthread identifier. */
985
+ } nvtxResourceGenericType_t;
986
+
987
+
988
+
989
+ /** \brief Resource Attribute Structure.
990
+ * \anchor RESOURCE_ATTRIBUTE_STRUCTURE
991
+ *
992
+ * This structure is used to describe the attributes of a resource. The layout of
993
+ * the structure is defined by a specific version of the tools extension
994
+ * library and can change between different versions of the Tools Extension
995
+ * library.
996
+ *
997
+ * \par Initializing the Attributes
998
+ *
999
+ * The caller should always perform the following three tasks when using
1000
+ * attributes:
1001
+ * <ul>
1002
+ * <li>Zero the structure
1003
+ * <li>Set the version field
1004
+ * <li>Set the size field
1005
+ * </ul>
1006
+ *
1007
+ * Zeroing the structure sets all the resource attributes types and values
1008
+ * to the default value.
1009
+ *
1010
+ * The version and size field are used by the Tools Extension
1011
+ * implementation to handle multiple versions of the attributes structure.
1012
+ *
1013
+ * It is recommended that the caller use one of the following to methods
1014
+ * to initialize the event attributes structure:
1015
+ *
1016
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
1017
+ * \code
1018
+ * nvtxResourceAttributes_t attribs = {0};
1019
+ * attribs.version = NVTX_VERSION;
1020
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1021
+ * \endcode
1022
+ *
1023
+ * \par Method 2: Initializing nvtxEventAttributes for a specific version
1024
+ * \code
1025
+ * nvtxResourceAttributes_v0 attribs = {0};
1026
+ * attribs.version = 2;
1027
+ * attribs.size = (uint16_t)(sizeof(nvtxResourceAttributes_v0));
1028
+ * \endcode
1029
+ *
1030
+ * If the caller uses Method 1 it is critical that the entire binary
1031
+ * layout of the structure be configured to 0 so that all fields
1032
+ * are initialized to the default value.
1033
+ *
1034
+ * The caller should either use both NVTX_VERSION and
1035
+ * NVTX_RESOURCE_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
1036
+ * and a versioned type (Method 2). Using a mix of the two methods
1037
+ * will likely cause either source level incompatibility or binary
1038
+ * incompatibility in the future.
1039
+ *
1040
+ * \par Settings Attribute Types and Values
1041
+ *
1042
+ *
1043
+ * \par Example:
1044
+ * \code
1045
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1046
+ *
1047
+ * // Initialize
1048
+ * nvtxResourceAttributes_t attribs = {0};
1049
+ * attribs.version = NVTX_VERSION;
1050
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1051
+ *
1052
+ * // Configure the Attributes
1053
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1054
+ * attribs.identifier.pValue = (const void*)pMutex;
1055
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1056
+ * attribs.message.ascii = "Single thread access to database.";
1057
+ *
1058
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1059
+ * \endcode
1060
+ *
1061
+ * \sa
1062
+ * ::nvtxDomainResourceCreate
1063
+ */
1064
+ typedef struct nvtxResourceAttributes_v0
1065
+ {
1066
+ /**
1067
+ * \brief Version flag of the structure.
1068
+ *
1069
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
1070
+ * supported in this header file. This can optionally be overridden to
1071
+ * another version of the tools extension library.
1072
+ */
1073
+ uint16_t version;
1074
+
1075
+ /**
1076
+ * \brief Size of the structure.
1077
+ *
1078
+ * Needs to be set to the size in bytes of this attribute
1079
+ * structure.
1080
+ */
1081
+ uint16_t size;
1082
+
1083
+ /**
1084
+ * \brief Identifier type specifies how to interpret the identifier field
1085
+ *
1086
+ * Defines the identifier format of the attribute structure's \ref RESOURCE_IDENTIFIER_FIELD
1087
+ * "identifier" field.
1088
+ *
1089
+ * Default Value is NVTX_RESOURCE_TYPE_UNKNOWN
1090
+ */
1091
+ int32_t identifierType; /* values from enums following the pattern nvtxResource[name]Type_t */
1092
+
1093
+ /**
1094
+ * \brief Identifier for the resource.
1095
+ * \anchor RESOURCE_IDENTIFIER_FIELD
1096
+ *
1097
+ * An identifier may be a pointer or a handle to an OS or middleware API object.
1098
+ * The resource type will assist in avoiding collisions where handles values may collide.
1099
+ */
1100
+ union identifier_t
1101
+ {
1102
+ const void* pValue;
1103
+ uint64_t ullValue;
1104
+ } identifier;
1105
+
1106
+ /** \brief Message type specified in this attribute structure.
1107
+ *
1108
+ * Defines the message format of the attribute structure's \ref RESOURCE_MESSAGE_FIELD
1109
+ * "message" field.
1110
+ *
1111
+ * Default Value is NVTX_MESSAGE_UNKNOWN
1112
+ */
1113
+ int32_t messageType; /* nvtxMessageType_t */
1114
+
1115
+ /** \brief Message assigned to this attribute structure. \anchor RESOURCE_MESSAGE_FIELD
1116
+ *
1117
+ * The text message that is attached to a resource.
1118
+ */
1119
+ nvtxMessageValue_t message;
1120
+
1121
+ } nvtxResourceAttributes_v0;
1122
+
1123
+ typedef struct nvtxResourceAttributes_v0 nvtxResourceAttributes_t;
1124
+
1125
+ /* \cond SHOW_HIDDEN
1126
+ * \version \NVTX_VERSION_2
1127
+ */
1128
+ #define NVTX_RESOURCE_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxResourceAttributes_v0) ) )
1129
+ typedef struct nvtxResourceHandle* nvtxResourceHandle_t;
1130
+ /** \endcond */
1131
+
1132
+
1133
+
1134
+ /* ------------------------------------------------------------------------- */
1135
+ /** \brief Create a resource object to track and associate data with OS and middleware objects
1136
+ *
1137
+ * Allows users to associate an API handle or pointer with a user-provided name.
1138
+ *
1139
+ *
1140
+ * \param domain - Domain to own the resource object
1141
+ * \param attribs - Attributes to be associated with the resource
1142
+ *
1143
+ * \return A handle that represents the newly created resource object.
1144
+ *
1145
+ * \par Example:
1146
+ * \code
1147
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1148
+ * nvtxResourceAttributes_t attribs = {0};
1149
+ * attribs.version = NVTX_VERSION;
1150
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1151
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1152
+ * attribs.identifier.pValue = (const void*)pMutex;
1153
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1154
+ * attribs.message.ascii = "Single thread access to database.";
1155
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1156
+ * \endcode
1157
+ *
1158
+ * \sa
1159
+ * ::nvtxResourceAttributes_t
1160
+ * ::nvtxDomainResourceDestroy
1161
+ *
1162
+ * \version \NVTX_VERSION_2
1163
+ * @{ */
1164
+ NVTX_DECLSPEC nvtxResourceHandle_t NVTX_API nvtxDomainResourceCreate(nvtxDomainHandle_t domain, nvtxResourceAttributes_t* attribs);
1165
+ /** @} */
1166
+
1167
+ /* ------------------------------------------------------------------------- */
1168
+ /** \brief Destroy a resource object to track and associate data with OS and middleware objects
1169
+ *
1170
+ * Allows users to associate an API handle or pointer with a user-provided name.
1171
+ *
1172
+ * \param resource - Handle to the resource in which to operate.
1173
+ *
1174
+ * \par Example:
1175
+ * \code
1176
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example domain");
1177
+ * nvtxResourceAttributes_t attribs = {0};
1178
+ * attribs.version = NVTX_VERSION;
1179
+ * attribs.size = NVTX_RESOURCE_ATTRIB_STRUCT_SIZE;
1180
+ * attribs.identifierType = NVTX_RESOURCE_TYPE_GENERIC_POINTER;
1181
+ * attribs.identifier.pValue = (const void*)pMutex;
1182
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
1183
+ * attribs.message.ascii = "Single thread access to database.";
1184
+ * nvtxResourceHandle_t handle = nvtxDomainResourceCreate(domain, attribs);
1185
+ * nvtxDomainResourceDestroy(handle);
1186
+ * \endcode
1187
+ *
1188
+ * \sa
1189
+ * ::nvtxDomainResourceCreate
1190
+ *
1191
+ * \version \NVTX_VERSION_2
1192
+ * @{ */
1193
+ NVTX_DECLSPEC void NVTX_API nvtxDomainResourceDestroy(nvtxResourceHandle_t resource);
1194
+ /** @} */
1195
+
1196
+
1197
+ /** \name Functions for NVTX Category Naming*/
1198
+
1199
+ /* ------------------------------------------------------------------------- */
1200
+ /**
1201
+ * \brief Annotate an NVTX category used within a domain.
1202
+ *
1203
+ * Categories are used to group sets of events. Each category is identified
1204
+ * through a unique ID and that ID is passed into any of the marker/range
1205
+ * events to assign that event to a specific category. The nvtxDomainNameCategory
1206
+ * function calls allow the user to assign a name to a category ID that is
1207
+ * specific to the domain.
1208
+ *
1209
+ * nvtxDomainNameCategory(NULL, category, name) is equivalent to calling
1210
+ * nvtxNameCategory(category, name).
1211
+ *
1212
+ * \param domain - The domain of scoping the category.
1213
+ * \param category - The category ID to name.
1214
+ * \param name - The name of the category.
1215
+ *
1216
+ * \remarks The category names are tracked per domain.
1217
+ *
1218
+ * \par Example:
1219
+ * \code
1220
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("example");
1221
+ * nvtxDomainNameCategoryA(domain, 1, "Memory Allocation");
1222
+ * nvtxDomainNameCategoryW(domain, 2, L"Memory Transfer");
1223
+ * \endcode
1224
+ *
1225
+ * \version \NVTX_VERSION_2
1226
+ * @{ */
1227
+ NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryA(nvtxDomainHandle_t domain, uint32_t category, const char* name);
1228
+ NVTX_DECLSPEC void NVTX_API nvtxDomainNameCategoryW(nvtxDomainHandle_t domain, uint32_t category, const wchar_t* name);
1229
+ /** @} */
1230
+
1231
+ /** \brief Annotate an NVTX category.
1232
+ *
1233
+ * Categories are used to group sets of events. Each category is identified
1234
+ * through a unique ID and that ID is passed into any of the marker/range
1235
+ * events to assign that event to a specific category. The nvtxNameCategory
1236
+ * function calls allow the user to assign a name to a category ID.
1237
+ *
1238
+ * \param category - The category ID to name.
1239
+ * \param name - The name of the category.
1240
+ *
1241
+ * \remarks The category names are tracked per process.
1242
+ *
1243
+ * \par Example:
1244
+ * \code
1245
+ * nvtxNameCategory(1, "Memory Allocation");
1246
+ * nvtxNameCategory(2, "Memory Transfer");
1247
+ * nvtxNameCategory(3, "Memory Object Lifetime");
1248
+ * \endcode
1249
+ *
1250
+ * \version \NVTX_VERSION_1
1251
+ * @{ */
1252
+ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryA(uint32_t category, const char* name);
1253
+ NVTX_DECLSPEC void NVTX_API nvtxNameCategoryW(uint32_t category, const wchar_t* name);
1254
+ /** @} */
1255
+
1256
+ /** \name Functions for OS Threads Naming*/
1257
+
1258
+ /* ------------------------------------------------------------------------- */
1259
+ /** \brief Annotate an OS thread.
1260
+ *
1261
+ * Allows the user to name an active thread of the current process. If an
1262
+ * invalid thread ID is provided or a thread ID from a different process is
1263
+ * used the behavior of the tool is implementation dependent.
1264
+ *
1265
+ * Tools expect thread ID to be a number that uniquely identifies the thread
1266
+ * at the time of the call. Note that a thread's ID can be reused after
1267
+ * it is destroyed. Tools may choose how to handle aliasing of thread IDs.
1268
+ *
1269
+ * POSIX pthread_t type returned by pthread_self() may not comply with these
1270
+ * expectations. Please use OS-specific thread ID instead of pthread_t.
1271
+ *
1272
+ * The thread name is associated to the default domain. To support domains
1273
+ * use resource objects via ::nvtxDomainResourceCreate.
1274
+ *
1275
+ * \param threadId - The ID of the thread to name.
1276
+ * \param name - The name of the thread.
1277
+ *
1278
+ * \par Examples:
1279
+ * MS Windows:
1280
+ * \code
1281
+ * #include <windows.h>
1282
+ * nvtxNameOsThread(GetCurrentThreadId(), "Current thread");
1283
+ * nvtxNameOsThread(GetThreadId(SomeThreadHandle), "Other thread");
1284
+ * \endcode
1285
+ *
1286
+ * Android:
1287
+ * \code
1288
+ * #include <unistd.h>
1289
+ * nvtxNameOsThreadA(gettid(), "Current thread");
1290
+ * nvtxNameOsThreadA(getpid(), "Main thread");
1291
+ * \endcode
1292
+ *
1293
+ * Linux:
1294
+ * \code
1295
+ * #include <sys/syscall.h>
1296
+ * nvtxNameOsThreadA(syscall(SYS_gettid), "Current thread");
1297
+ * \endcode
1298
+ * \code
1299
+ * #include <unistd.h>
1300
+ * nvtxNameOsThreadA(getpid(), "Main thread");
1301
+ * \endcode
1302
+ *
1303
+ * OS X:
1304
+ * \code
1305
+ * #include <sys/syscall.h>
1306
+ * nvtxNameOsThreadA(syscall(SYS_thread_selfid), "Current thread");
1307
+ * \endcode
1308
+ * \code
1309
+ * #include <pthread.h>
1310
+ * __uint64_t id;
1311
+ * pthread_threadid_np(pthread_self(), &id);
1312
+ * nvtxNameOsThreadA(id, "Current thread");
1313
+ * pthread_threadid_np(somePThreadId, &id);
1314
+ * nvtxNameOsThreadA(id, "Other thread");
1315
+ * \endcode
1316
+ *
1317
+ * \version \NVTX_VERSION_1
1318
+ * @{ */
1319
+ NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadA(uint32_t threadId, const char* name);
1320
+ NVTX_DECLSPEC void NVTX_API nvtxNameOsThreadW(uint32_t threadId, const wchar_t* name);
1321
+ /** @} */
1322
+
1323
+
1324
+ /** @} */ /*END defgroup*/
1325
+ /* ========================================================================= */
1326
+ /** \defgroup STRING_REGISTRATION String Registration
1327
+ *
1328
+ * Registered strings are intended to increase performance by lowering instrumentation
1329
+ * overhead. String may be registered once and the handle may be passed in place of
1330
+ * a string where an the APIs may allow.
1331
+ *
1332
+ * See \ref STRING_REGISTRATION for more details
1333
+ *
1334
+ * @{
1335
+ */
1336
+
1337
+ /* ------------------------------------------------------------------------- */
1338
+ /** \brief Register a string.
1339
+
1340
+ * Registers an immutable string with NVTX. Once registered the pointer used
1341
+ * to register the domain name can be used in nvtxEventAttributes_t
1342
+ * \ref MESSAGE_FIELD. This allows NVTX implementation to skip copying the
1343
+ * contents of the message on each event invocation.
1344
+ *
1345
+ * String registration is an optimization. It is recommended to use string
1346
+ * registration if the string will be passed to an event many times.
1347
+ *
1348
+ * String are not unregistered, except that by unregistering the entire domain
1349
+ *
1350
+ * \param domain - Domain handle. If NULL then the global domain is used.
1351
+ * \param string - A unique pointer to a sequence of characters.
1352
+ *
1353
+ * \return A handle representing the registered string.
1354
+ *
1355
+ * \par Example:
1356
+ * \code
1357
+ * nvtxDomainCreateA("com.nvidia.nvtx.example");
1358
+ * nvtxStringHandle_t message = nvtxDomainRegisterStringA(domain, "registered string");
1359
+ * nvtxEventAttributes_t eventAttrib = {0};
1360
+ * eventAttrib.version = NVTX_VERSION;
1361
+ * eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1362
+ * eventAttrib.messageType = NVTX_MESSAGE_TYPE_REGISTERED;
1363
+ * eventAttrib.message.registered = message;
1364
+ * \endcode
1365
+ *
1366
+ * \version \NVTX_VERSION_2
1367
+ * @{ */
1368
+ NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringA(nvtxDomainHandle_t domain, const char* string);
1369
+ NVTX_DECLSPEC nvtxStringHandle_t NVTX_API nvtxDomainRegisterStringW(nvtxDomainHandle_t domain, const wchar_t* string);
1370
+ /** @} */
1371
+
1372
+ /** @} */ /*END defgroup*/
1373
+ /* ========================================================================= */
1374
+ /** \defgroup DOMAINS Domains
1375
+ *
1376
+ * Domains are used to group events to a developer defined scope. Middleware
1377
+ * vendors may also scope their own events to avoid collisions with the
1378
+ * the application developer's events, so that the application developer may
1379
+ * inspect both parts and easily differentiate or filter them. By default
1380
+ * all events are scoped to a global domain where NULL is provided or when
1381
+ * using APIs provided b versions of NVTX below v2
1382
+ *
1383
+ * Domains are intended to be typically long lived objects with the intention
1384
+ * of logically separating events of large modules from each other such as
1385
+ * middleware libraries from each other and the main application.
1386
+ *
1387
+ * See \ref DOMAINS for more details
1388
+ *
1389
+ * @{
1390
+ */
1391
+
1392
+ /* ------------------------------------------------------------------------- */
1393
+ /** \brief Register a NVTX domain.
1394
+ *
1395
+ * Domains are used to scope annotations. All NVTX_VERSION_0 and NVTX_VERSION_1
1396
+ * annotations are scoped to the global domain. The function nvtxDomainCreate
1397
+ * creates a new named domain.
1398
+ *
1399
+ * Each domain maintains its own nvtxRangePush and nvtxRangePop stack.
1400
+ *
1401
+ * \param name - A unique string representing the domain.
1402
+ *
1403
+ * \return A handle representing the domain.
1404
+ *
1405
+ * \par Example:
1406
+ * \code
1407
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example");
1408
+ *
1409
+ * nvtxMarkA("nvtxMarkA to global domain");
1410
+ *
1411
+ * nvtxEventAttributes_t eventAttrib1 = {0};
1412
+ * eventAttrib1.version = NVTX_VERSION;
1413
+ * eventAttrib1.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1414
+ * eventAttrib1.message.ascii = "nvtxDomainMarkEx to global domain";
1415
+ * nvtxDomainMarkEx(NULL, &eventAttrib1);
1416
+ *
1417
+ * nvtxEventAttributes_t eventAttrib2 = {0};
1418
+ * eventAttrib2.version = NVTX_VERSION;
1419
+ * eventAttrib2.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
1420
+ * eventAttrib2.message.ascii = "nvtxDomainMarkEx to com.nvidia.nvtx.example";
1421
+ * nvtxDomainMarkEx(domain, &eventAttrib2);
1422
+ * nvtxDomainDestroy(domain);
1423
+ * \endcode
1424
+ *
1425
+ * \sa
1426
+ * ::nvtxDomainDestroy
1427
+ *
1428
+ * \version \NVTX_VERSION_2
1429
+ * @{ */
1430
+ NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateA(const char* name);
1431
+ NVTX_DECLSPEC nvtxDomainHandle_t NVTX_API nvtxDomainCreateW(const wchar_t* name);
1432
+ /** @} */
1433
+
1434
+ /* ------------------------------------------------------------------------- */
1435
+ /** \brief Unregister a NVTX domain.
1436
+ *
1437
+ * Unregisters the domain handle and frees all domain specific resources.
1438
+ *
1439
+ * \param domain - the domain handle
1440
+ *
1441
+ * \par Example:
1442
+ * \code
1443
+ * nvtxDomainHandle_t domain = nvtxDomainCreateA("com.nvidia.nvtx.example");
1444
+ * nvtxDomainDestroy(domain);
1445
+ * \endcode
1446
+ *
1447
+ * \sa
1448
+ * ::nvtxDomainCreateA
1449
+ * ::nvtxDomainCreateW
1450
+ *
1451
+ * \version \NVTX_VERSION_2
1452
+ * @{ */
1453
+ NVTX_DECLSPEC void NVTX_API nvtxDomainDestroy(nvtxDomainHandle_t domain);
1454
+ /** @} */
1455
+
1456
+
1457
+ /** @} */ /*END defgroup*/
1458
+ /* ========================================================================= */
1459
+ /** \cond SHOW_HIDDEN */
1460
+
1461
+ #ifdef UNICODE
1462
+ #define nvtxMark nvtxMarkW
1463
+ #define nvtxRangeStart nvtxRangeStartW
1464
+ #define nvtxRangePush nvtxRangePushW
1465
+ #define nvtxNameCategory nvtxNameCategoryW
1466
+ #define nvtxNameOsThread nvtxNameOsThreadW
1467
+ /* NVTX_VERSION_2 */
1468
+ #define nvtxDomainCreate nvtxDomainCreateW
1469
+ #define nvtxDomainRegisterString nvtxDomainRegisterStringW
1470
+ #define nvtxDomainNameCategory nvtxDomainNameCategoryW
1471
+ #else
1472
+ #define nvtxMark nvtxMarkA
1473
+ #define nvtxRangeStart nvtxRangeStartA
1474
+ #define nvtxRangePush nvtxRangePushA
1475
+ #define nvtxNameCategory nvtxNameCategoryA
1476
+ #define nvtxNameOsThread nvtxNameOsThreadA
1477
+ /* NVTX_VERSION_2 */
1478
+ #define nvtxDomainCreate nvtxDomainCreateA
1479
+ #define nvtxDomainRegisterString nvtxDomainRegisterStringA
1480
+ #define nvtxDomainNameCategory nvtxDomainNameCategoryA
1481
+ #endif
1482
+
1483
+ /** \endcond */
1484
+
1485
+ #ifdef __cplusplus
1486
+ } /* extern "C" */
1487
+ #endif /* __cplusplus */
1488
+
1489
+ #define NVTX_IMPL_GUARD /* Ensure other headers cannot included directly */
1490
+
1491
+ #include "nvtxDetail/nvtxTypes.h"
1492
+
1493
+ #ifndef NVTX_NO_IMPL
1494
+ #include "nvtxDetail/nvtxImpl.h"
1495
+ #endif /*NVTX_NO_IMPL*/
1496
+
1497
+ #undef NVTX_IMPL_GUARD
1498
+
1499
+ #endif /* !defined(NVTX_VERSION) */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtCuda.h ADDED
@@ -0,0 +1,170 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #include "nvToolsExt.h"
39
+
40
+ #include "cuda.h"
41
+
42
+ #ifndef NVTOOLSEXT_CUDA_V3
43
+ #define NVTOOLSEXT_CUDA_V3
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for CUDA Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_CUDA CUDA Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate CUDA resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_CUDA 4
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for CUDA
71
+ */
72
+ typedef enum nvtxResourceCUDAType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_CUDA_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDA, 1), /* CUdevice */
75
+ NVTX_RESOURCE_TYPE_CUDA_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 2), /* CUcontext */
76
+ NVTX_RESOURCE_TYPE_CUDA_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDA, 3), /* CUstream */
77
+ NVTX_RESOURCE_TYPE_CUDA_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDA, 4), /* CUevent */
78
+ } nvtxResourceCUDAType_t;
79
+
80
+
81
+ /* ------------------------------------------------------------------------- */
82
+ /** \brief Annotates a CUDA device.
83
+ *
84
+ * Allows the user to associate a CUDA device with a user-provided name.
85
+ *
86
+ * \param device - The handle of the CUDA device to name.
87
+ * \param name - The name of the CUDA device.
88
+ *
89
+ * \version \NVTX_VERSION_1
90
+ * @{ */
91
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceA(CUdevice device, const char* name);
92
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuDeviceW(CUdevice device, const wchar_t* name);
93
+ /** @} */
94
+
95
+ /* ------------------------------------------------------------------------- */
96
+ /** \brief Annotates a CUDA context.
97
+ *
98
+ * Allows the user to associate a CUDA context with a user-provided name.
99
+ *
100
+ * \param context - The handle of the CUDA context to name.
101
+ * \param name - The name of the CUDA context.
102
+ *
103
+ * \par Example:
104
+ * \code
105
+ * CUresult status = cuCtxCreate( &cuContext, 0, cuDevice );
106
+ * if ( CUDA_SUCCESS != status )
107
+ * goto Error;
108
+ * nvtxNameCuContext(cuContext, "CTX_NAME");
109
+ * \endcode
110
+ *
111
+ * \version \NVTX_VERSION_1
112
+ * @{ */
113
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextA(CUcontext context, const char* name);
114
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuContextW(CUcontext context, const wchar_t* name);
115
+ /** @} */
116
+
117
+ /* ------------------------------------------------------------------------- */
118
+ /** \brief Annotates a CUDA stream.
119
+ *
120
+ * Allows the user to associate a CUDA stream with a user-provided name.
121
+ *
122
+ * \param stream - The handle of the CUDA stream to name.
123
+ * \param name - The name of the CUDA stream.
124
+ *
125
+ * \version \NVTX_VERSION_1
126
+ * @{ */
127
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamA(CUstream stream, const char* name);
128
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuStreamW(CUstream stream, const wchar_t* name);
129
+ /** @} */
130
+
131
+ /* ------------------------------------------------------------------------- */
132
+ /** \brief Annotates a CUDA event.
133
+ *
134
+ * Allows the user to associate a CUDA event with a user-provided name.
135
+ *
136
+ * \param event - The handle of the CUDA event to name.
137
+ * \param name - The name of the CUDA event.
138
+ *
139
+ * \version \NVTX_VERSION_1
140
+ * @{ */
141
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventA(CUevent event, const char* name);
142
+ NVTX_DECLSPEC void NVTX_API nvtxNameCuEventW(CUevent event, const wchar_t* name);
143
+ /** @} */
144
+
145
+ /** @} */ /* END RESOURCE_NAMING */
146
+
147
+ /* ========================================================================= */
148
+ #ifdef UNICODE
149
+ #define nvtxNameCuDevice nvtxNameCuDeviceW
150
+ #define nvtxNameCuContext nvtxNameCuContextW
151
+ #define nvtxNameCuStream nvtxNameCuStreamW
152
+ #define nvtxNameCuEvent nvtxNameCuEventW
153
+ #else
154
+ #define nvtxNameCuDevice nvtxNameCuDeviceA
155
+ #define nvtxNameCuContext nvtxNameCuContextA
156
+ #define nvtxNameCuStream nvtxNameCuStreamA
157
+ #define nvtxNameCuEvent nvtxNameCuEventA
158
+ #endif
159
+
160
+ #ifdef __cplusplus
161
+ }
162
+ #endif /* __cplusplus */
163
+
164
+ #ifndef NVTX_NO_IMPL
165
+ #define NVTX_IMPL_GUARD_CUDA /* Ensure other headers cannot included directly */
166
+ #include "nvtxDetail/nvtxImplCuda_v3.h"
167
+ #undef NVTX_IMPL_GUARD_CUDA
168
+ #endif /*NVTX_NO_IMPL*/
169
+
170
+ #endif /* NVTOOLSEXT_CUDA_V3 */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtCudaRt.h ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #include "nvToolsExt.h"
39
+
40
+ #include "cuda.h"
41
+ #include "driver_types.h"
42
+
43
+ #ifndef NVTOOLSEXT_CUDART_V3
44
+ #define NVTOOLSEXT_CUDART_V3
45
+
46
+ #ifdef __cplusplus
47
+ extern "C" {
48
+ #endif /* __cplusplus */
49
+
50
+ /* ========================================================================= */
51
+ /** \name Functions for CUDA Resource Naming
52
+ */
53
+ /** \addtogroup RESOURCE_NAMING
54
+ * \section RESOURCE_NAMING_CUDART CUDA Runtime Resource Naming
55
+ *
56
+ * This section covers the API functions that allow to annotate CUDA resources
57
+ * with user-provided names.
58
+ *
59
+ * @{
60
+ */
61
+
62
+ /* ------------------------------------------------------------------------- */
63
+ /* \cond SHOW_HIDDEN
64
+ * \brief Used to build a non-colliding value for resource types separated class
65
+ * \version \NVTX_VERSION_2
66
+ */
67
+ #define NVTX_RESOURCE_CLASS_CUDART 5
68
+ /** \endcond */
69
+
70
+ /* ------------------------------------------------------------------------- */
71
+ /** \brief Resource types for CUDART
72
+ */
73
+ typedef enum nvtxResourceCUDARTType_t
74
+ {
75
+ NVTX_RESOURCE_TYPE_CUDART_DEVICE = NVTX_RESOURCE_MAKE_TYPE(CUDART, 0), /* int device */
76
+ NVTX_RESOURCE_TYPE_CUDART_STREAM = NVTX_RESOURCE_MAKE_TYPE(CUDART, 1), /* cudaStream_t */
77
+ NVTX_RESOURCE_TYPE_CUDART_EVENT = NVTX_RESOURCE_MAKE_TYPE(CUDART, 2), /* cudaEvent_t */
78
+ } nvtxResourceCUDARTType_t;
79
+
80
+
81
+ /* ------------------------------------------------------------------------- */
82
+ /** \brief Annotates a CUDA device.
83
+ *
84
+ * Allows the user to associate a CUDA device with a user-provided name.
85
+ *
86
+ * \param device - The id of the CUDA device to name.
87
+ * \param name - The name of the CUDA device.
88
+ *
89
+ * \version \NVTX_VERSION_1
90
+ * @{ */
91
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceA(int device, const char* name);
92
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaDeviceW(int device, const wchar_t* name);
93
+ /** @} */
94
+
95
+ /* ------------------------------------------------------------------------- */
96
+ /** \brief Annotates a CUDA stream.
97
+ *
98
+ * Allows the user to associate a CUDA stream with a user-provided name.
99
+ *
100
+ * \param stream - The handle of the CUDA stream to name.
101
+ * \param name - The name of the CUDA stream.
102
+ *
103
+ * \version \NVTX_VERSION_1
104
+ * @{ */
105
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamA(cudaStream_t stream, const char* name);
106
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaStreamW(cudaStream_t stream, const wchar_t* name);
107
+ /** @} */
108
+
109
+ /* ------------------------------------------------------------------------- */
110
+ /** \brief Annotates a CUDA event.
111
+ *
112
+ * Allows the user to associate a CUDA event with a user-provided name.
113
+ *
114
+ * \param event - The handle of the CUDA event to name.
115
+ * \param name - The name of the CUDA event.
116
+ *
117
+ * \version \NVTX_VERSION_1
118
+ * @{ */
119
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventA(cudaEvent_t event, const char* name);
120
+ NVTX_DECLSPEC void NVTX_API nvtxNameCudaEventW(cudaEvent_t event, const wchar_t* name);
121
+ /** @} */
122
+
123
+ /** @} */ /* END RESOURCE_NAMING */
124
+
125
+ /* ========================================================================= */
126
+ #ifdef UNICODE
127
+ #define nvtxNameCudaDevice nvtxNameCudaDeviceW
128
+ #define nvtxNameCudaStream nvtxNameCudaStreamW
129
+ #define nvtxNameCudaEvent nvtxNameCudaEventW
130
+ #else
131
+ #define nvtxNameCudaDevice nvtxNameCudaDeviceA
132
+ #define nvtxNameCudaStream nvtxNameCudaStreamA
133
+ #define nvtxNameCudaEvent nvtxNameCudaEventA
134
+ #endif
135
+
136
+ #ifdef __cplusplus
137
+ }
138
+ #endif /* __cplusplus */
139
+
140
+ #ifndef NVTX_NO_IMPL
141
+ #define NVTX_IMPL_GUARD_CUDART /* Ensure other headers cannot included directly */
142
+ #include "nvtxDetail/nvtxImplCudaRt_v3.h"
143
+ #undef NVTX_IMPL_GUARD_CUDART
144
+ #endif /*NVTX_NO_IMPL*/
145
+
146
+ #endif /* NVTOOLSEXT_CUDART_V3 */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtOpenCL.h ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #include "nvToolsExt.h"
39
+
40
+ #include <CL/cl.h>
41
+
42
+ #ifndef NVTOOLSEXT_OPENCL_V3
43
+ #define NVTOOLSEXT_OPENCL_V3
44
+
45
+ #ifdef __cplusplus
46
+ extern "C" {
47
+ #endif /* __cplusplus */
48
+
49
+ /* ========================================================================= */
50
+ /** \name Functions for OpenCL Resource Naming
51
+ */
52
+ /** \addtogroup RESOURCE_NAMING
53
+ * \section RESOURCE_NAMING_OPENCL OpenCL Resource Naming
54
+ *
55
+ * This section covers the API functions that allow to annotate OpenCL resources
56
+ * with user-provided names.
57
+ *
58
+ * @{
59
+ */
60
+
61
+ /* ------------------------------------------------------------------------- */
62
+ /* \cond SHOW_HIDDEN
63
+ * \brief Used to build a non-colliding value for resource types separated class
64
+ * \version \NVTX_VERSION_2
65
+ */
66
+ #define NVTX_RESOURCE_CLASS_OPENCL 6
67
+ /** \endcond */
68
+
69
+ /* ------------------------------------------------------------------------- */
70
+ /** \brief Resource types for OpenCL
71
+ */
72
+ typedef enum nvtxResourceOpenCLType_t
73
+ {
74
+ NVTX_RESOURCE_TYPE_OPENCL_DEVICE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 1),
75
+ NVTX_RESOURCE_TYPE_OPENCL_CONTEXT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 2),
76
+ NVTX_RESOURCE_TYPE_OPENCL_COMMANDQUEUE = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 3),
77
+ NVTX_RESOURCE_TYPE_OPENCL_MEMOBJECT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 4),
78
+ NVTX_RESOURCE_TYPE_OPENCL_SAMPLER = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 5),
79
+ NVTX_RESOURCE_TYPE_OPENCL_PROGRAM = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 6),
80
+ NVTX_RESOURCE_TYPE_OPENCL_EVENT = NVTX_RESOURCE_MAKE_TYPE(OPENCL, 7),
81
+ } nvtxResourceOpenCLType_t;
82
+
83
+
84
+ /* ------------------------------------------------------------------------- */
85
+ /** \brief Annotates an OpenCL device.
86
+ *
87
+ * Allows to associate an OpenCL device with a user-provided name.
88
+ *
89
+ * \param device - The handle of the OpenCL device to name.
90
+ * \param name - The name of the OpenCL device.
91
+ *
92
+ * \version \NVTX_VERSION_1
93
+ * @{ */
94
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceA(cl_device_id device, const char* name);
95
+ NVTX_DECLSPEC void NVTX_API nvtxNameClDeviceW(cl_device_id device, const wchar_t* name);
96
+ /** @} */
97
+
98
+ /* ------------------------------------------------------------------------- */
99
+ /** \brief Annotates an OpenCL context.
100
+ *
101
+ * Allows to associate an OpenCL context with a user-provided name.
102
+ *
103
+ * \param context - The handle of the OpenCL context to name.
104
+ * \param name - The name of the OpenCL context.
105
+ *
106
+ * \version \NVTX_VERSION_1
107
+ * @{ */
108
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextA(cl_context context, const char* name);
109
+ NVTX_DECLSPEC void NVTX_API nvtxNameClContextW(cl_context context, const wchar_t* name);
110
+ /** @} */
111
+
112
+ /* ------------------------------------------------------------------------- */
113
+ /** \brief Annotates an OpenCL command queue.
114
+ *
115
+ * Allows to associate an OpenCL command queue with a user-provided name.
116
+ *
117
+ * \param command_queue - The handle of the OpenCL command queue to name.
118
+ * \param name - The name of the OpenCL command queue.
119
+ *
120
+ * \version \NVTX_VERSION_1
121
+ * @{ */
122
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueA(cl_command_queue command_queue, const char* name);
123
+ NVTX_DECLSPEC void NVTX_API nvtxNameClCommandQueueW(cl_command_queue command_queue, const wchar_t* name);
124
+ /** @} */
125
+
126
+ /* ------------------------------------------------------------------------- */
127
+ /** \brief Annotates an OpenCL memory object.
128
+ *
129
+ * Allows to associate an OpenCL memory object with a user-provided name.
130
+ *
131
+ * \param memobj - The handle of the OpenCL memory object to name.
132
+ * \param name - The name of the OpenCL memory object.
133
+ *
134
+ * \version \NVTX_VERSION_1
135
+ * @{ */
136
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectA(cl_mem memobj, const char* name);
137
+ NVTX_DECLSPEC void NVTX_API nvtxNameClMemObjectW(cl_mem memobj, const wchar_t* name);
138
+ /** @} */
139
+
140
+ /* ------------------------------------------------------------------------- */
141
+ /** \brief Annotates an OpenCL sampler.
142
+ *
143
+ * Allows to associate an OpenCL sampler with a user-provided name.
144
+ *
145
+ * \param sampler - The handle of the OpenCL sampler to name.
146
+ * \param name - The name of the OpenCL sampler.
147
+ *
148
+ * \version \NVTX_VERSION_1
149
+ * @{ */
150
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerA(cl_sampler sampler, const char* name);
151
+ NVTX_DECLSPEC void NVTX_API nvtxNameClSamplerW(cl_sampler sampler, const wchar_t* name);
152
+ /** @} */
153
+
154
+ /* ------------------------------------------------------------------------- */
155
+ /** \brief Annotates an OpenCL program.
156
+ *
157
+ * Allows to associate an OpenCL program with a user-provided name.
158
+ *
159
+ * \param program - The handle of the OpenCL program to name.
160
+ * \param name - The name of the OpenCL program.
161
+ *
162
+ * \code
163
+ * cpProgram = clCreateProgramWithSource(cxGPUContext, 1,
164
+ * (const char **) &cSourceCL, &program_length, &ciErrNum);
165
+ * shrCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup);
166
+ * nvtxNameClProgram(cpProgram, L"PROGRAM_NAME");
167
+ * \endcode
168
+ *
169
+ * \version \NVTX_VERSION_1
170
+ * @{ */
171
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramA(cl_program program, const char* name);
172
+ NVTX_DECLSPEC void NVTX_API nvtxNameClProgramW(cl_program program, const wchar_t* name);
173
+ /** @} */
174
+
175
+ /* ------------------------------------------------------------------------- */
176
+ /** \brief Annotates an OpenCL event.
177
+ *
178
+ * Allows to associate an OpenCL event with a user-provided name.
179
+ *
180
+ * \param evnt - The handle of the OpenCL event to name.
181
+ * \param name - The name of the OpenCL event.
182
+ *
183
+ * \version \NVTX_VERSION_1
184
+ * @{ */
185
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventA(cl_event evnt, const char* name);
186
+ NVTX_DECLSPEC void NVTX_API nvtxNameClEventW(cl_event evnt, const wchar_t* name);
187
+ /** @} */
188
+
189
+ /** @} */ /* END RESOURCE_NAMING */
190
+
191
+ /* ========================================================================= */
192
+ #ifdef UNICODE
193
+ #define nvtxNameClDevice nvtxNameClDeviceW
194
+ #define nvtxNameClContext nvtxNameClContextW
195
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueW
196
+ #define nvtxNameClMemObject nvtxNameClMemObjectW
197
+ #define nvtxNameClSampler nvtxNameClSamplerW
198
+ #define nvtxNameClProgram nvtxNameClProgramW
199
+ #define nvtxNameClEvent nvtxNameClEventW
200
+ #else
201
+ #define nvtxNameClDevice nvtxNameClDeviceA
202
+ #define nvtxNameClContext nvtxNameClContextA
203
+ #define nvtxNameClCommandQueue nvtxNameClCommandQueueA
204
+ #define nvtxNameClMemObject nvtxNameClMemObjectA
205
+ #define nvtxNameClSampler nvtxNameClSamplerA
206
+ #define nvtxNameClProgram nvtxNameClProgramA
207
+ #define nvtxNameClEvent nvtxNameClEventA
208
+ #endif
209
+
210
+ #ifdef __cplusplus
211
+ }
212
+ #endif /* __cplusplus */
213
+
214
+ #ifndef NVTX_NO_IMPL
215
+ #define NVTX_IMPL_GUARD_OPENCL /* Ensure other headers cannot included directly */
216
+ #include "nvtxDetail/nvtxImplOpenCL_v3.h"
217
+ #undef NVTX_IMPL_GUARD_OPENCL
218
+ #endif /*NVTX_NO_IMPL*/
219
+
220
+ #endif /* NVTOOLSEXT_OPENCL_V3 */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvToolsExtSync.h ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO USER:
5
+ *
6
+ * This source code is subject to NVIDIA ownership rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * This software and the information contained herein is PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
11
+ * of a form of NVIDIA software license agreement.
12
+ *
13
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
14
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
15
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
16
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
17
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
18
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
19
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
20
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
21
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
22
+ * OR PERFORMANCE OF THIS SOURCE CODE.
23
+ *
24
+ * U.S. Government End Users. This source code is a "commercial item" as
25
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
26
+ * "commercial computer software" and "commercial computer software
27
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
28
+ * and is provided to the U.S. Government only as a commercial end item.
29
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
30
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
31
+ * source code with only those rights set forth herein.
32
+ *
33
+ * Any use of this source code in individual and commercial software must
34
+ * include, in the user documentation and internal comments to the code,
35
+ * the above Disclaimer and U.S. Government End Users Notice.
36
+ */
37
+
38
+ #include "nvToolsExt.h"
39
+
40
+ #ifndef NVTOOLSEXT_SYNC_V3
41
+ #define NVTOOLSEXT_SYNC_V3
42
+
43
+ #ifdef __cplusplus
44
+ extern "C" {
45
+ #endif /* __cplusplus */
46
+
47
+ /* \cond SHOW_HIDDEN
48
+ * \version \NVTX_VERSION_2
49
+ */
50
+ #define NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE ( (uint16_t)( sizeof(nvtxSyncUserAttributes_v0) ) )
51
+ /** \endcond */
52
+
53
+
54
+ /**
55
+ * \page PAGE_SYNCHRONIZATION Synchronization
56
+ *
57
+ * This section covers a subset of the API that allow users to track additional
58
+ * synchronization details of their application. Naming OS synchronization primitives
59
+ * may allow users to better understand the data collected by traced synchronization
60
+ * APIs. Additionally, a user defined synchronization object can allow the users to
61
+ * to tell the tools when the user is building their own synchronization system
62
+ * that do not rely on the OS to provide behaviors and instead use techniques like
63
+ * atomic operations and spinlocks.
64
+ *
65
+ * See module \ref SYNCHRONIZATION for details.
66
+ *
67
+ * \par Example:
68
+ * \code
69
+ * class MyMutex
70
+ * {
71
+ * volatile long bLocked;
72
+ * nvtxSyncUser_t hSync;
73
+ * public:
74
+ * MyMutex(const char* name, nvtxDomainHandle_t d){
75
+ * bLocked = 0;
76
+ *
77
+ * nvtxSyncUserAttributes_t attribs = { 0 };
78
+ * attribs.version = NVTX_VERSION;
79
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
80
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
81
+ * attribs.message.ascii = name;
82
+ * hSync = nvtxDomainSyncUserCreate(d, &attribs);
83
+ * }
84
+ *
85
+ * ~MyMutex() {
86
+ * nvtxDomainSyncUserDestroy(hSync);
87
+ * }
88
+ *
89
+ * bool Lock() {
90
+ * nvtxDomainSyncUserAcquireStart(hSync);
91
+ * bool acquired = __sync_bool_compare_and_swap(&bLocked, 0, 1);//atomic compiler intrinsic
92
+
93
+ * if (acquired) {
94
+ * nvtxDomainSyncUserAcquireSuccess(hSync);
95
+ * }
96
+ * else {
97
+ * nvtxDomainSyncUserAcquireFailed(hSync);
98
+ * }
99
+ * return acquired;
100
+ * }
101
+
102
+ * void Unlock() {
103
+ * nvtxDomainSyncUserReleasing(hSync);
104
+ * bLocked = false;
105
+ * }
106
+ * };
107
+ * \endcode
108
+ *
109
+ * \version \NVTX_VERSION_2
110
+ */
111
+
112
+ /* ------------------------------------------------------------------------- */
113
+ /* \cond SHOW_HIDDEN
114
+ * \brief Used to build a non-colliding value for resource types separated class
115
+ * \version \NVTX_VERSION_2
116
+ */
117
+ #define NVTX_RESOURCE_CLASS_SYNC_OS 2 /**< Synchronization objects that are OS specific. */
118
+ #define NVTX_RESOURCE_CLASS_SYNC_PTHREAD 3 /**< Synchronization objects that are from the POSIX Threads API (pthread)*/
119
+ /** \endcond */
120
+
121
+
122
+ /* ------------------------------------------------------------------------- */
123
+ /** \defgroup SYNCHRONIZATION Synchronization
124
+ * See page \ref PAGE_SYNCHRONIZATION.
125
+ * @{
126
+ */
127
+
128
+ /** \brief Resource type values for OSs with POSIX Thread API support
129
+ */
130
+ typedef enum nvtxResourceSyncPosixThreadType_t
131
+ {
132
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 1), /* pthread_mutex_t */
133
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_CONDITION = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 2), /* pthread_cond_t */
134
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_RWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 3), /* pthread_rwlock_t */
135
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_BARRIER = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 4), /* pthread_barrier_t */
136
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 5), /* pthread_spinlock_t */
137
+ NVTX_RESOURCE_TYPE_SYNC_PTHREAD_ONCE = NVTX_RESOURCE_MAKE_TYPE(SYNC_PTHREAD, 6) /* pthread_once_t */
138
+ } nvtxResourceSyncPosixThreadType_t;
139
+
140
+ /** \brief Resource type values for Windows OSs
141
+ */
142
+ typedef enum nvtxResourceSyncWindowsType_t
143
+ {
144
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
145
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
146
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_EVENT = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
147
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_CRITICAL_SECTION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
148
+ NVTX_RESOURCE_TYPE_SYNC_WINDOWS_SRWLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5)
149
+ } nvtxResourceSyncWindowsType_t;
150
+
151
+ /** \brief Resource type values for Linux and Linux derived OSs such as Android
152
+ * \sa
153
+ * ::nvtxResourceSyncPosixThreadType_t
154
+ */
155
+ typedef enum nvtxResourceSyncLinuxType_t
156
+ {
157
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_MUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 1),
158
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_FUTEX = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 2),
159
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SEMAPHORE = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 3),
160
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_COMPLETION = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 4),
161
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SPINLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 5),
162
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_SEQLOCK = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 6),
163
+ NVTX_RESOURCE_TYPE_SYNC_LINUX_RCU = NVTX_RESOURCE_MAKE_TYPE(SYNC_OS, 7)
164
+ } nvtxResourceSyncLinuxType_t;
165
+
166
+ /** \brief Resource type values for Android come from Linux.
167
+ * \sa
168
+ * ::nvtxResourceSyncLinuxType_t
169
+ * ::nvtxResourceSyncPosixThreadType_t
170
+ */
171
+ typedef enum nvtxResourceSyncLinuxType_t nvtxResourceSyncAndroidType_t;
172
+
173
+ /** \brief User Defined Synchronization Object Handle .
174
+ * \anchor SYNCUSER_HANDLE_STRUCTURE
175
+ *
176
+ * This structure is opaque to the user and is used as a handle to reference
177
+ * a user defined syncrhonization object. The tools will return a pointer through the API for the application
178
+ * to hold on it's behalf to reference the string in the future.
179
+ *
180
+ */
181
+ typedef struct nvtxSyncUser* nvtxSyncUser_t;
182
+
183
+ /** \brief User Defined Synchronization Object Attributes Structure.
184
+ * \anchor USERDEF_SYNC_ATTRIBUTES_STRUCTURE
185
+ *
186
+ * This structure is used to describe the attributes of a user defined synchronization
187
+ * object. The layout of the structure is defined by a specific version of the tools
188
+ * extension library and can change between different versions of the Tools Extension
189
+ * library.
190
+ *
191
+ * \par Initializing the Attributes
192
+ *
193
+ * The caller should always perform the following three tasks when using
194
+ * attributes:
195
+ * <ul>
196
+ * <li>Zero the structure
197
+ * <li>Set the version field
198
+ * <li>Set the size field
199
+ * </ul>
200
+ *
201
+ * Zeroing the structure sets all the event attributes types and values
202
+ * to the default value.
203
+ *
204
+ * The version and size field are used by the Tools Extension
205
+ * implementation to handle multiple versions of the attributes structure.
206
+ *
207
+ * It is recommended that the caller use one of the following to methods
208
+ * to initialize the event attributes structure:
209
+ *
210
+ * \par Method 1: Initializing nvtxEventAttributes for future compatibility
211
+ * \code
212
+ * nvtxSyncUserAttributes_t attribs = {0};
213
+ * attribs.version = NVTX_VERSION;
214
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
215
+ * \endcode
216
+ *
217
+ * \par Method 2: Initializing nvtxSyncUserAttributes_t for a specific version
218
+ * \code
219
+ * nvtxSyncUserAttributes_t attribs = {0};
220
+ * attribs.version = 1;
221
+ * attribs.size = (uint16_t)(sizeof(nvtxSyncUserAttributes_t));
222
+ * \endcode
223
+ *
224
+ * If the caller uses Method 1 it is critical that the entire binary
225
+ * layout of the structure be configured to 0 so that all fields
226
+ * are initialized to the default value.
227
+ *
228
+ * The caller should either use both NVTX_VERSION and
229
+ * NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE (Method 1) or use explicit values
230
+ * and a versioned type (Method 2). Using a mix of the two methods
231
+ * will likely cause either source level incompatibility or binary
232
+ * incompatibility in the future.
233
+ *
234
+ * \par Settings Attribute Types and Values
235
+ *
236
+ *
237
+ * \par Example:
238
+ * \code
239
+ * // Initialize
240
+ * nvtxSyncUserAttributes_t attribs = {0};
241
+ * attribs.version = NVTX_VERSION;
242
+ * attribs.size = NVTX_SYNCUSER_ATTRIB_STRUCT_SIZE;
243
+ *
244
+ * // Configure the Attributes
245
+ * attribs.messageType = NVTX_MESSAGE_TYPE_ASCII;
246
+ * attribs.message.ascii = "Example";
247
+ * \endcode
248
+ *
249
+ * \sa
250
+ * ::nvtxDomainSyncUserCreate
251
+ */
252
+ typedef struct nvtxSyncUserAttributes_v0
253
+ {
254
+ /**
255
+ * \brief Version flag of the structure.
256
+ *
257
+ * Needs to be set to NVTX_VERSION to indicate the version of NVTX APIs
258
+ * supported in this header file. This can optionally be overridden to
259
+ * another version of the tools extension library.
260
+ */
261
+ uint16_t version;
262
+
263
+ /**
264
+ * \brief Size of the structure.
265
+ *
266
+ * Needs to be set to the size in bytes of the event attribute
267
+ * structure used to specify the event.
268
+ */
269
+ uint16_t size;
270
+
271
+ /** \brief Message type specified in this attribute structure.
272
+ *
273
+ * Defines the message format of the attribute structure's \ref nvtxSyncUserAttributes_v0::message
274
+ * "message" field.
275
+ *
276
+ * Default Value is NVTX_MESSAGE_UNKNOWN
277
+ */
278
+ int32_t messageType; /* nvtxMessageType_t */
279
+
280
+ /** \brief Message assigned to this attribute structure.
281
+ *
282
+ * The text message that is attached to an event.
283
+ */
284
+ nvtxMessageValue_t message;
285
+
286
+ } nvtxSyncUserAttributes_v0;
287
+
288
+ typedef struct nvtxSyncUserAttributes_v0 nvtxSyncUserAttributes_t;
289
+
290
+ /* ------------------------------------------------------------------------- */
291
+ /** \brief Create a user defined synchronization object
292
+ * This is used to track non-OS synchronization working with spinlocks and atomics
293
+ *
294
+ * \param domain - Domain to own the resource
295
+ * \param attribs - A structure to assign multiple attributes to the object.
296
+ *
297
+ * \return A handle that represents the newly created user defined synchronization object.
298
+ *
299
+ * \sa
300
+ * ::nvtxDomainSyncUserCreate
301
+ * ::nvtxDomainSyncUserDestroy
302
+ * ::nvtxDomainSyncUserAcquireStart
303
+ * ::nvtxDomainSyncUserAcquireFailed
304
+ * ::nvtxDomainSyncUserAcquireSuccess
305
+ * ::nvtxDomainSyncUserReleasing
306
+ *
307
+ * \version \NVTX_VERSION_2
308
+ */
309
+ NVTX_DECLSPEC nvtxSyncUser_t NVTX_API nvtxDomainSyncUserCreate(nvtxDomainHandle_t domain, const nvtxSyncUserAttributes_t* attribs);
310
+
311
+ /* ------------------------------------------------------------------------- */
312
+ /** \brief Destroy a user defined synchronization object
313
+ * This is used to track non-OS synchronization working with spinlocks and atomics
314
+ *
315
+ * \param handle - A handle to the object to operate on.
316
+ *
317
+ * \sa
318
+ * ::nvtxDomainSyncUserCreate
319
+ * ::nvtxDomainSyncUserDestroy
320
+ * ::nvtxDomainSyncUserAcquireStart
321
+ * ::nvtxDomainSyncUserAcquireFailed
322
+ * ::nvtxDomainSyncUserAcquireSuccess
323
+ * ::nvtxDomainSyncUserReleasing
324
+ *
325
+ * \version \NVTX_VERSION_2
326
+ */
327
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserDestroy(nvtxSyncUser_t handle);
328
+
329
+ /* ------------------------------------------------------------------------- */
330
+ /** \brief Signal to tools that an attempt to acquire a user defined synchronization object
331
+ *
332
+ * \param handle - A handle to the object to operate on.
333
+ *
334
+ * \sa
335
+ * ::nvtxDomainSyncUserCreate
336
+ * ::nvtxDomainSyncUserDestroy
337
+ * ::nvtxDomainSyncUserAcquireStart
338
+ * ::nvtxDomainSyncUserAcquireFailed
339
+ * ::nvtxDomainSyncUserAcquireSuccess
340
+ * ::nvtxDomainSyncUserReleasing
341
+ *
342
+ * \version \NVTX_VERSION_2
343
+ */
344
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireStart(nvtxSyncUser_t handle);
345
+
346
+ /* ------------------------------------------------------------------------- */
347
+ /** \brief Signal to tools of failure in acquiring a user defined synchronization object
348
+ * This should be called after \ref nvtxDomainSyncUserAcquireStart
349
+ *
350
+ * \param handle - A handle to the object to operate on.
351
+ *
352
+ * \sa
353
+ * ::nvtxDomainSyncUserCreate
354
+ * ::nvtxDomainSyncUserDestroy
355
+ * ::nvtxDomainSyncUserAcquireStart
356
+ * ::nvtxDomainSyncUserAcquireFailed
357
+ * ::nvtxDomainSyncUserAcquireSuccess
358
+ * ::nvtxDomainSyncUserReleasing
359
+ *
360
+ * \version \NVTX_VERSION_2
361
+ */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireFailed(nvtxSyncUser_t handle);
362
+
363
+ /* ------------------------------------------------------------------------- */
364
+ /** \brief Signal to tools of success in acquiring a user defined synchronization object
365
+ * This should be called after \ref nvtxDomainSyncUserAcquireStart.
366
+ *
367
+ * \param handle - A handle to the object to operate on.
368
+ *
369
+ * \sa
370
+ * ::nvtxDomainSyncUserCreate
371
+ * ::nvtxDomainSyncUserDestroy
372
+ * ::nvtxDomainSyncUserAcquireStart
373
+ * ::nvtxDomainSyncUserAcquireFailed
374
+ * ::nvtxDomainSyncUserAcquireSuccess
375
+ * ::nvtxDomainSyncUserReleasing
376
+ *
377
+ * \version \NVTX_VERSION_2
378
+ */NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserAcquireSuccess(nvtxSyncUser_t handle);
379
+
380
+ /* ------------------------------------------------------------------------- */
381
+ /** \brief Signal to tools of releasing a reservation on user defined synchronization object
382
+ * This should be called after \ref nvtxDomainSyncUserAcquireSuccess.
383
+ *
384
+ * \param handle - A handle to the object to operate on.
385
+ *
386
+ * \sa
387
+ * ::nvtxDomainSyncUserCreate
388
+ * ::nvtxDomainSyncUserDestroy
389
+ * ::nvtxDomainSyncUserAcquireStart
390
+ * ::nvtxDomainSyncUserAcquireFailed
391
+ * ::nvtxDomainSyncUserAcquireSuccess
392
+ * ::nvtxDomainSyncUserReleasing
393
+ *
394
+ * \version \NVTX_VERSION_2
395
+ */
396
+ NVTX_DECLSPEC void NVTX_API nvtxDomainSyncUserReleasing(nvtxSyncUser_t handle);
397
+
398
+
399
+ /** @} */ /*END defgroup*/
400
+
401
+ #ifdef __cplusplus
402
+ }
403
+ #endif /* __cplusplus */
404
+
405
+ #ifndef NVTX_NO_IMPL
406
+ #define NVTX_IMPL_GUARD_SYNC /* Ensure other headers cannot included directly */
407
+ #include "nvtxDetail/nvtxImplSync_v3.h"
408
+ #undef NVTX_IMPL_GUARD_SYNC
409
+ #endif /*NVTX_NO_IMPL*/
410
+
411
+ #endif /* NVTOOLSEXT_SYNC_V3 */
.venv/lib/python3.11/site-packages/nvidia/nvtx/include/nvtx3/nvtxDetail/nvtxImpl.h ADDED
@@ -0,0 +1,469 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* This file was procedurally generated! Do not modify this file by hand. */
2
+
3
+ /*
4
+ * Copyright 2009-2016 NVIDIA Corporation. All rights reserved.
5
+ *
6
+ * NOTICE TO USER:
7
+ *
8
+ * This source code is subject to NVIDIA ownership rights under U.S. and
9
+ * international Copyright laws.
10
+ *
11
+ * This software and the information contained herein is PROPRIETARY and
12
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
13
+ * of a form of NVIDIA software license agreement.
14
+ *
15
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
16
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
17
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
18
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
19
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
20
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
21
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
22
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
23
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
24
+ * OR PERFORMANCE OF THIS SOURCE CODE.
25
+ *
26
+ * U.S. Government End Users. This source code is a "commercial item" as
27
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
28
+ * "commercial computer software" and "commercial computer software
29
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
30
+ * and is provided to the U.S. Government only as a commercial end item.
31
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
32
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
33
+ * source code with only those rights set forth herein.
34
+ *
35
+ * Any use of this source code in individual and commercial software must
36
+ * include, in the user documentation and internal comments to the code,
37
+ * the above Disclaimer and U.S. Government End Users Notice.
38
+ */
39
+
40
+ #ifndef NVTX_IMPL_GUARD
41
+ #error Never include this file directly -- it is automatically included by nvToolsExt.h (except when NVTX_NO_IMPL is defined).
42
+ #endif
43
+
44
+ /* ---- Include required platform headers ---- */
45
+
46
+ #if defined(_WIN32)
47
+
48
+ #include <Windows.h>
49
+
50
+ #else
51
+ #include <unistd.h>
52
+
53
+ #if defined(__ANDROID__)
54
+ #include <android/api-level.h>
55
+ #endif
56
+
57
+ #if defined(__linux__) || defined(__CYGWIN__)
58
+ #include <sched.h>
59
+ #endif
60
+
61
+ #include <limits.h>
62
+ #include <dlfcn.h>
63
+ #include <fcntl.h>
64
+ #include <stdlib.h>
65
+ #include <stdio.h>
66
+ #include <sys/types.h>
67
+ #include <unistd.h>
68
+ #include <errno.h>
69
+
70
+ #include <string.h>
71
+ #include <sys/types.h>
72
+ #include <pthread.h>
73
+ #include <stdlib.h>
74
+ #include <wchar.h>
75
+
76
+ #endif
77
+
78
+ /* ---- Define macros used in this file ---- */
79
+
80
+ #define NVTX_INIT_STATE_FRESH 0
81
+ #define NVTX_INIT_STATE_STARTED 1
82
+ #define NVTX_INIT_STATE_COMPLETE 2
83
+
84
+ #ifdef NVTX_DEBUG_PRINT
85
+ #ifdef __ANDROID__
86
+ #include <android/log.h>
87
+ #define NVTX_ERR(...) __android_log_print(ANDROID_LOG_ERROR, "NVTOOLSEXT", __VA_ARGS__);
88
+ #define NVTX_INFO(...) __android_log_print(ANDROID_LOG_INFO, "NVTOOLSEXT", __VA_ARGS__);
89
+ #else
90
+ #include <stdio.h>
91
+ #define NVTX_ERR(...) fprintf(stderr, "NVTX_ERROR: " __VA_ARGS__)
92
+ #define NVTX_INFO(...) fprintf(stderr, "NVTX_INFO: " __VA_ARGS__)
93
+ #endif
94
+ #else /* !defined(NVTX_DEBUG_PRINT) */
95
+ #define NVTX_ERR(...)
96
+ #define NVTX_INFO(...)
97
+ #endif
98
+
99
+ #ifdef __cplusplus
100
+ extern "C" {
101
+ #endif /* __cplusplus */
102
+
103
+ #ifdef __GNUC__
104
+ #pragma GCC visibility push(hidden)
105
+ #endif
106
+
107
+ /* ---- Forward declare all functions referenced in globals ---- */
108
+
109
+ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_VERSIONED_IDENTIFIER(nvtxInitOnce)(void);
110
+ NVTX_LINKONCE_FWDDECL_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
111
+ NvtxCallbackModule module,
112
+ NvtxFunctionTable* out_table,
113
+ unsigned int* out_size);
114
+ NVTX_LINKONCE_FWDDECL_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(
115
+ uint32_t version);
116
+ NVTX_LINKONCE_FWDDECL_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(
117
+ uint32_t exportTableId);
118
+
119
+ #include "nvtxInitDecls.h"
120
+
121
+ /* ---- Define all globals ---- */
122
+
123
+ typedef struct nvtxGlobals_t
124
+ {
125
+ volatile unsigned int initState;
126
+ NvtxExportTableCallbacks etblCallbacks;
127
+ NvtxExportTableVersionInfo etblVersionInfo;
128
+
129
+ /* Implementation function pointers */
130
+ nvtxMarkEx_impl_fntype nvtxMarkEx_impl_fnptr;
131
+ nvtxMarkA_impl_fntype nvtxMarkA_impl_fnptr;
132
+ nvtxMarkW_impl_fntype nvtxMarkW_impl_fnptr;
133
+ nvtxRangeStartEx_impl_fntype nvtxRangeStartEx_impl_fnptr;
134
+ nvtxRangeStartA_impl_fntype nvtxRangeStartA_impl_fnptr;
135
+ nvtxRangeStartW_impl_fntype nvtxRangeStartW_impl_fnptr;
136
+ nvtxRangeEnd_impl_fntype nvtxRangeEnd_impl_fnptr;
137
+ nvtxRangePushEx_impl_fntype nvtxRangePushEx_impl_fnptr;
138
+ nvtxRangePushA_impl_fntype nvtxRangePushA_impl_fnptr;
139
+ nvtxRangePushW_impl_fntype nvtxRangePushW_impl_fnptr;
140
+ nvtxRangePop_impl_fntype nvtxRangePop_impl_fnptr;
141
+ nvtxNameCategoryA_impl_fntype nvtxNameCategoryA_impl_fnptr;
142
+ nvtxNameCategoryW_impl_fntype nvtxNameCategoryW_impl_fnptr;
143
+ nvtxNameOsThreadA_impl_fntype nvtxNameOsThreadA_impl_fnptr;
144
+ nvtxNameOsThreadW_impl_fntype nvtxNameOsThreadW_impl_fnptr;
145
+
146
+ nvtxNameCuDeviceA_fakeimpl_fntype nvtxNameCuDeviceA_impl_fnptr;
147
+ nvtxNameCuDeviceW_fakeimpl_fntype nvtxNameCuDeviceW_impl_fnptr;
148
+ nvtxNameCuContextA_fakeimpl_fntype nvtxNameCuContextA_impl_fnptr;
149
+ nvtxNameCuContextW_fakeimpl_fntype nvtxNameCuContextW_impl_fnptr;
150
+ nvtxNameCuStreamA_fakeimpl_fntype nvtxNameCuStreamA_impl_fnptr;
151
+ nvtxNameCuStreamW_fakeimpl_fntype nvtxNameCuStreamW_impl_fnptr;
152
+ nvtxNameCuEventA_fakeimpl_fntype nvtxNameCuEventA_impl_fnptr;
153
+ nvtxNameCuEventW_fakeimpl_fntype nvtxNameCuEventW_impl_fnptr;
154
+
155
+ nvtxNameClDeviceA_fakeimpl_fntype nvtxNameClDeviceA_impl_fnptr;
156
+ nvtxNameClDeviceW_fakeimpl_fntype nvtxNameClDeviceW_impl_fnptr;
157
+ nvtxNameClContextA_fakeimpl_fntype nvtxNameClContextA_impl_fnptr;
158
+ nvtxNameClContextW_fakeimpl_fntype nvtxNameClContextW_impl_fnptr;
159
+ nvtxNameClCommandQueueA_fakeimpl_fntype nvtxNameClCommandQueueA_impl_fnptr;
160
+ nvtxNameClCommandQueueW_fakeimpl_fntype nvtxNameClCommandQueueW_impl_fnptr;
161
+ nvtxNameClMemObjectA_fakeimpl_fntype nvtxNameClMemObjectA_impl_fnptr;
162
+ nvtxNameClMemObjectW_fakeimpl_fntype nvtxNameClMemObjectW_impl_fnptr;
163
+ nvtxNameClSamplerA_fakeimpl_fntype nvtxNameClSamplerA_impl_fnptr;
164
+ nvtxNameClSamplerW_fakeimpl_fntype nvtxNameClSamplerW_impl_fnptr;
165
+ nvtxNameClProgramA_fakeimpl_fntype nvtxNameClProgramA_impl_fnptr;
166
+ nvtxNameClProgramW_fakeimpl_fntype nvtxNameClProgramW_impl_fnptr;
167
+ nvtxNameClEventA_fakeimpl_fntype nvtxNameClEventA_impl_fnptr;
168
+ nvtxNameClEventW_fakeimpl_fntype nvtxNameClEventW_impl_fnptr;
169
+
170
+ nvtxNameCudaDeviceA_impl_fntype nvtxNameCudaDeviceA_impl_fnptr;
171
+ nvtxNameCudaDeviceW_impl_fntype nvtxNameCudaDeviceW_impl_fnptr;
172
+ nvtxNameCudaStreamA_fakeimpl_fntype nvtxNameCudaStreamA_impl_fnptr;
173
+ nvtxNameCudaStreamW_fakeimpl_fntype nvtxNameCudaStreamW_impl_fnptr;
174
+ nvtxNameCudaEventA_fakeimpl_fntype nvtxNameCudaEventA_impl_fnptr;
175
+ nvtxNameCudaEventW_fakeimpl_fntype nvtxNameCudaEventW_impl_fnptr;
176
+
177
+ nvtxDomainMarkEx_impl_fntype nvtxDomainMarkEx_impl_fnptr;
178
+ nvtxDomainRangeStartEx_impl_fntype nvtxDomainRangeStartEx_impl_fnptr;
179
+ nvtxDomainRangeEnd_impl_fntype nvtxDomainRangeEnd_impl_fnptr;
180
+ nvtxDomainRangePushEx_impl_fntype nvtxDomainRangePushEx_impl_fnptr;
181
+ nvtxDomainRangePop_impl_fntype nvtxDomainRangePop_impl_fnptr;
182
+ nvtxDomainResourceCreate_impl_fntype nvtxDomainResourceCreate_impl_fnptr;
183
+ nvtxDomainResourceDestroy_impl_fntype nvtxDomainResourceDestroy_impl_fnptr;
184
+ nvtxDomainNameCategoryA_impl_fntype nvtxDomainNameCategoryA_impl_fnptr;
185
+ nvtxDomainNameCategoryW_impl_fntype nvtxDomainNameCategoryW_impl_fnptr;
186
+ nvtxDomainRegisterStringA_impl_fntype nvtxDomainRegisterStringA_impl_fnptr;
187
+ nvtxDomainRegisterStringW_impl_fntype nvtxDomainRegisterStringW_impl_fnptr;
188
+ nvtxDomainCreateA_impl_fntype nvtxDomainCreateA_impl_fnptr;
189
+ nvtxDomainCreateW_impl_fntype nvtxDomainCreateW_impl_fnptr;
190
+ nvtxDomainDestroy_impl_fntype nvtxDomainDestroy_impl_fnptr;
191
+ nvtxInitialize_impl_fntype nvtxInitialize_impl_fnptr;
192
+
193
+ nvtxDomainSyncUserCreate_impl_fntype nvtxDomainSyncUserCreate_impl_fnptr;
194
+ nvtxDomainSyncUserDestroy_impl_fntype nvtxDomainSyncUserDestroy_impl_fnptr;
195
+ nvtxDomainSyncUserAcquireStart_impl_fntype nvtxDomainSyncUserAcquireStart_impl_fnptr;
196
+ nvtxDomainSyncUserAcquireFailed_impl_fntype nvtxDomainSyncUserAcquireFailed_impl_fnptr;
197
+ nvtxDomainSyncUserAcquireSuccess_impl_fntype nvtxDomainSyncUserAcquireSuccess_impl_fnptr;
198
+ nvtxDomainSyncUserReleasing_impl_fntype nvtxDomainSyncUserReleasing_impl_fnptr;
199
+
200
+ /* Tables of function pointers -- Extra null added to the end to ensure
201
+ * a crash instead of silent corruption if a tool reads off the end. */
202
+ NvtxFunctionPointer* functionTable_CORE [NVTX_CBID_CORE_SIZE + 1];
203
+ NvtxFunctionPointer* functionTable_CUDA [NVTX_CBID_CUDA_SIZE + 1];
204
+ NvtxFunctionPointer* functionTable_OPENCL[NVTX_CBID_OPENCL_SIZE + 1];
205
+ NvtxFunctionPointer* functionTable_CUDART[NVTX_CBID_CUDART_SIZE + 1];
206
+ NvtxFunctionPointer* functionTable_CORE2 [NVTX_CBID_CORE2_SIZE + 1];
207
+ NvtxFunctionPointer* functionTable_SYNC [NVTX_CBID_SYNC_SIZE + 1];
208
+ } nvtxGlobals_t;
209
+
210
+ NVTX_LINKONCE_DEFINE_GLOBAL nvtxGlobals_t NVTX_VERSIONED_IDENTIFIER(nvtxGlobals) =
211
+ {
212
+ NVTX_INIT_STATE_FRESH,
213
+
214
+ {
215
+ sizeof(NvtxExportTableCallbacks),
216
+ NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)
217
+ },
218
+ {
219
+ sizeof(NvtxExportTableVersionInfo),
220
+ NVTX_VERSION,
221
+ 0,
222
+ NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)
223
+ },
224
+
225
+ /* Implementation function pointers */
226
+ NVTX_VERSIONED_IDENTIFIER(nvtxMarkEx_impl_init),
227
+ NVTX_VERSIONED_IDENTIFIER(nvtxMarkA_impl_init),
228
+ NVTX_VERSIONED_IDENTIFIER(nvtxMarkW_impl_init),
229
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartEx_impl_init),
230
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartA_impl_init),
231
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeStartW_impl_init),
232
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangeEnd_impl_init),
233
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePushEx_impl_init),
234
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePushA_impl_init),
235
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePushW_impl_init),
236
+ NVTX_VERSIONED_IDENTIFIER(nvtxRangePop_impl_init),
237
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryA_impl_init),
238
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCategoryW_impl_init),
239
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadA_impl_init),
240
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameOsThreadW_impl_init),
241
+
242
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceA_impl_init),
243
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuDeviceW_impl_init),
244
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextA_impl_init),
245
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuContextW_impl_init),
246
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamA_impl_init),
247
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuStreamW_impl_init),
248
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventA_impl_init),
249
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCuEventW_impl_init),
250
+
251
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceA_impl_init),
252
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClDeviceW_impl_init),
253
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextA_impl_init),
254
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClContextW_impl_init),
255
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueA_impl_init),
256
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClCommandQueueW_impl_init),
257
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectA_impl_init),
258
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClMemObjectW_impl_init),
259
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerA_impl_init),
260
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClSamplerW_impl_init),
261
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramA_impl_init),
262
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClProgramW_impl_init),
263
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventA_impl_init),
264
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameClEventW_impl_init),
265
+
266
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceA_impl_init),
267
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaDeviceW_impl_init),
268
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamA_impl_init),
269
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaStreamW_impl_init),
270
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventA_impl_init),
271
+ NVTX_VERSIONED_IDENTIFIER(nvtxNameCudaEventW_impl_init),
272
+
273
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainMarkEx_impl_init),
274
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeStartEx_impl_init),
275
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangeEnd_impl_init),
276
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePushEx_impl_init),
277
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRangePop_impl_init),
278
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceCreate_impl_init),
279
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainResourceDestroy_impl_init),
280
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryA_impl_init),
281
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainNameCategoryW_impl_init),
282
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringA_impl_init),
283
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainRegisterStringW_impl_init),
284
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateA_impl_init),
285
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainCreateW_impl_init),
286
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainDestroy_impl_init),
287
+ NVTX_VERSIONED_IDENTIFIER(nvtxInitialize_impl_init),
288
+
289
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserCreate_impl_init),
290
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserDestroy_impl_init),
291
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireStart_impl_init),
292
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireFailed_impl_init),
293
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserAcquireSuccess_impl_init),
294
+ NVTX_VERSIONED_IDENTIFIER(nvtxDomainSyncUserReleasing_impl_init),
295
+
296
+ /* Tables of function pointers */
297
+ {
298
+ 0,
299
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkEx_impl_fnptr,
300
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkA_impl_fnptr,
301
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxMarkW_impl_fnptr,
302
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartEx_impl_fnptr,
303
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartA_impl_fnptr,
304
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeStartW_impl_fnptr,
305
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangeEnd_impl_fnptr,
306
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushEx_impl_fnptr,
307
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushA_impl_fnptr,
308
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePushW_impl_fnptr,
309
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxRangePop_impl_fnptr,
310
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryA_impl_fnptr,
311
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCategoryW_impl_fnptr,
312
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadA_impl_fnptr,
313
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameOsThreadW_impl_fnptr,
314
+ 0
315
+ },
316
+ {
317
+ 0,
318
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceA_impl_fnptr,
319
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuDeviceW_impl_fnptr,
320
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextA_impl_fnptr,
321
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuContextW_impl_fnptr,
322
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamA_impl_fnptr,
323
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuStreamW_impl_fnptr,
324
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventA_impl_fnptr,
325
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCuEventW_impl_fnptr,
326
+ 0
327
+ },
328
+ {
329
+ 0,
330
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceA_impl_fnptr,
331
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClDeviceW_impl_fnptr,
332
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextA_impl_fnptr,
333
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClContextW_impl_fnptr,
334
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueA_impl_fnptr,
335
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClCommandQueueW_impl_fnptr,
336
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectA_impl_fnptr,
337
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClMemObjectW_impl_fnptr,
338
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerA_impl_fnptr,
339
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClSamplerW_impl_fnptr,
340
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramA_impl_fnptr,
341
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClProgramW_impl_fnptr,
342
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventA_impl_fnptr,
343
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameClEventW_impl_fnptr,
344
+ 0
345
+ },
346
+ {
347
+ 0,
348
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceA_impl_fnptr,
349
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaDeviceW_impl_fnptr,
350
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamA_impl_fnptr,
351
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaStreamW_impl_fnptr,
352
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventA_impl_fnptr,
353
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxNameCudaEventW_impl_fnptr,
354
+ 0
355
+ },
356
+ {
357
+ 0,
358
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainMarkEx_impl_fnptr,
359
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeStartEx_impl_fnptr,
360
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangeEnd_impl_fnptr,
361
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePushEx_impl_fnptr,
362
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRangePop_impl_fnptr,
363
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceCreate_impl_fnptr,
364
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainResourceDestroy_impl_fnptr,
365
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryA_impl_fnptr,
366
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainNameCategoryW_impl_fnptr,
367
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringA_impl_fnptr,
368
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainRegisterStringW_impl_fnptr,
369
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateA_impl_fnptr,
370
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainCreateW_impl_fnptr,
371
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainDestroy_impl_fnptr,
372
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxInitialize_impl_fnptr,
373
+ 0
374
+ },
375
+ {
376
+ 0,
377
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserCreate_impl_fnptr,
378
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserDestroy_impl_fnptr,
379
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireStart_impl_fnptr,
380
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireFailed_impl_fnptr,
381
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserAcquireSuccess_impl_fnptr,
382
+ (NvtxFunctionPointer*)&NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).nvtxDomainSyncUserReleasing_impl_fnptr,
383
+ 0
384
+ }
385
+ };
386
+
387
+ /* ---- Define static inline implementations of core API functions ---- */
388
+
389
+ #include "nvtxImplCore.h"
390
+
391
+ /* ---- Define implementations of export table functions ---- */
392
+
393
+ NVTX_LINKONCE_DEFINE_FUNCTION int NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiGetModuleFunctionTable)(
394
+ NvtxCallbackModule module,
395
+ NvtxFunctionTable* out_table,
396
+ unsigned int* out_size)
397
+ {
398
+ unsigned int bytes = 0;
399
+ NvtxFunctionTable table = (NvtxFunctionTable)0;
400
+
401
+ switch (module)
402
+ {
403
+ case NVTX_CB_MODULE_CORE:
404
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE;
405
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE);
406
+ break;
407
+ case NVTX_CB_MODULE_CUDA:
408
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA;
409
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDA);
410
+ break;
411
+ case NVTX_CB_MODULE_OPENCL:
412
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL;
413
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_OPENCL);
414
+ break;
415
+ case NVTX_CB_MODULE_CUDART:
416
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART;
417
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CUDART);
418
+ break;
419
+ case NVTX_CB_MODULE_CORE2:
420
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2;
421
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_CORE2);
422
+ break;
423
+ case NVTX_CB_MODULE_SYNC:
424
+ table = NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC;
425
+ bytes = (unsigned int)sizeof(NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).functionTable_SYNC);
426
+ break;
427
+ default: return 0;
428
+ }
429
+
430
+ if (out_size)
431
+ *out_size = (bytes / (unsigned int)sizeof(NvtxFunctionPointer*)) - 1;
432
+
433
+ if (out_table)
434
+ *out_table = table;
435
+
436
+ return 1;
437
+ }
438
+
439
+ NVTX_LINKONCE_DEFINE_FUNCTION const void* NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxGetExportTable)(uint32_t exportTableId)
440
+ {
441
+ switch (exportTableId)
442
+ {
443
+ case NVTX_ETID_CALLBACKS: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblCallbacks;
444
+ case NVTX_ETID_VERSIONINFO: return &NVTX_VERSIONED_IDENTIFIER(nvtxGlobals).etblVersionInfo;
445
+ default: return 0;
446
+ }
447
+ }
448
+
449
+ NVTX_LINKONCE_DEFINE_FUNCTION void NVTX_API NVTX_VERSIONED_IDENTIFIER(nvtxEtiSetInjectionNvtxVersion)(uint32_t version)
450
+ {
451
+ /* Reserved for custom implementations to resolve problems with tools */
452
+ (void)version;
453
+ }
454
+
455
+ /* ---- Define implementations of init versions of all API functions ---- */
456
+
457
+ #include "nvtxInitDefs.h"
458
+
459
+ /* ---- Define implementations of initialization functions ---- */
460
+
461
+ #include "nvtxInit.h"
462
+
463
+ #ifdef __GNUC__
464
+ #pragma GCC visibility pop
465
+ #endif
466
+
467
+ #ifdef __cplusplus
468
+ } /* extern "C" */
469
+ #endif /* __cplusplus */