BryanW committed on Mar 23

Commit

76cbda0

verified ·

1 Parent(s): 6954e2b

Add files using upload-large-folder tool

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/common_functions.h +65 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.h +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.hpp +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_egl_interop.h +645 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp16.h +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.h +475 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_occupancy.h +2094 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline_primitives.h +148 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti.h +123 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_events.h +1349 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling.h +936 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_runtime_cbid.h +504 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_functions.h +65 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/library_types.h +111 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_cuda_host.h +179 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.hpp +588 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.h +330 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.hpp +161 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/__init__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/convert.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/pack.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/tags.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/unpack.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/__pycache__/__init__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE +3 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE.APACHE +177 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE.BSD +23 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__init__.py +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/__init__.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_elffile.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_manylinux.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_musllinux.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_parser.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_structures.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_tokenizer.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/markers.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/requirements.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/specifiers.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/tags.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/utils.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/version.cpython-312.pyc +0 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_elffile.py +108 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_musllinux.py +83 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_parser.py +356 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_structures.py +61 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_tokenizer.py +192 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/markers.py +253 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/specifiers.py +1011 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/tags.py +571 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/utils.py +172 -0

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/common_functions.h ADDED Viewed

	@@ -0,0 +1,65 @@

+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* macro not set yet: emit the "internal header" diagnostic below */
+#if defined(_MSC_VER) /* this branch uses #pragma message; the #else branch uses #warning */
+#pragma message("common_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "common_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif /* _MSC_VER */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
+#endif /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+#include "crt/common_functions.h" /* the header this file wraps, located under crt/ */
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__) /* marker is set above only when this file defined the macro itself */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
+#endif /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__ */

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.h ADDED Viewed

The diff for this file is too large to render. See raw diff

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.hpp ADDED Viewed

The diff for this file is too large to render. See raw diff

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_egl_interop.h ADDED Viewed

	@@ -0,0 +1,645 @@

+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__CUDA_EGL_INTEROP_H__)
+#define __CUDA_EGL_INTEROP_H__
+#include "cuda_runtime_api.h"
+#include "cuda_runtime.h"
+#include "cudart_platform.h"
+#include "EGL/egl.h"
+#include "EGL/eglext.h"
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+/**
+ * \addtogroup CUDART_TYPES
+ * @{
+ */
+ /**
+ * Maximum number of planes per frame.
+ *
+ * Used in this header as the length of the pArray, pPitch and planeDesc
+ * arrays of the cudaEglFrame structure.
+ */
+#define CUDA_EGL_MAX_PLANES 3
+/**
+ * CUDA EglFrame type - array or pointer
+ *
+ * Two enumerators with explicit values: 0 for a CUDA array frame and 1 for a
+ * pitched-pointer frame. The value is stored in the frameType member of
+ * cudaEglFrame.
+ * NOTE(review): presumably this selects which member of the frame union in
+ * cudaEglFrame (pArray or pPitch) is meaningful - confirm against the CUDA
+ * Runtime API documentation.
+ */
+typedef enum cudaEglFrameType_enum
+{
+    cudaEglFrameTypeArray = 0,  /**< Frame type CUDA array */
+    cudaEglFrameTypePitch = 1,  /**< Frame type CUDA pointer */
+} cudaEglFrameType;
+/**
+ * Resource location flags - sysmem or vidmem
+ *
+ * For a CUDA context on iGPU, video and system memory are equivalent, so
+ * these flags have no effect on execution.
+ *
+ * For a CUDA context on dGPU, applications can use the flag ::cudaEglResourceLocationFlags
+ * to give a hint about the desired location.
+ *
+ * ::cudaEglResourceLocationSysmem - the frame data is made resident in system memory
+ * to be accessed by CUDA.
+ *
+ * ::cudaEglResourceLocationVidmem - the frame data is made resident in dedicated
+ * video memory to be accessed by CUDA.
+ *
+ * There may be additional latency due to a new allocation and data migration
+ * if the frame is produced in a different memory.
+ */
+typedef enum cudaEglResourceLocationFlags_enum {
+    cudaEglResourceLocationSysmem   = 0x00,       /**< Resource location sysmem */
+    cudaEglResourceLocationVidmem   = 0x01,       /**< Resource location vidmem */
+} cudaEglResourceLocationFlags;
+/**
+ * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
+ *
+ * Enumerator values are assigned explicitly and are not contiguous: the values
+ * 4, 5, 30 and 103 are not used in this list.
+ * NOTE(review): some descriptions name an interleave order that differs from the
+ * enumerator name (e.g. YUYV422 is described as UYVY and UYVY422 as YUYV) - confirm
+ * the byte-ordering convention against the CUDA documentation before relying on them.
+ */
+typedef enum cudaEglColorFormat_enum {
+    cudaEglColorFormatYUV420Planar            = 0,  /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420SemiPlanar        = 1,  /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
+    cudaEglColorFormatYUV422Planar            = 2,  /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422SemiPlanar        = 3,  /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
+    cudaEglColorFormatARGB                    = 6,  /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
+    cudaEglColorFormatRGBA                    = 7,  /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
+    cudaEglColorFormatL                       = 8,  /**< single luminance channel in one surface. */
+    cudaEglColorFormatR                       = 9,  /**< single color channel in one surface. */
+    cudaEglColorFormatYUV444Planar            = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV444SemiPlanar        = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
+    cudaEglColorFormatYUYV422                 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatUYVY422                 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
+    cudaEglColorFormatABGR                    = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
+    cudaEglColorFormatBGRA                    = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
+    cudaEglColorFormatA                       = 16, /**< Alpha color format - one channel in one surface. */
+    cudaEglColorFormatRG                      = 17, /**< R/G color format - two channels in one surface with GR byte ordering */
+    cudaEglColorFormatAYUV                    = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    cudaEglColorFormatYVU444SemiPlanar        = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422SemiPlanar        = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420SemiPlanar        = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatVYUY_ER                 = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
+    cudaEglColorFormatUYVY_ER                 = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
+    cudaEglColorFormatYUYV_ER                 = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatYVYU_ER                 = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
+    cudaEglColorFormatYUVA_ER                 = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    cudaEglColorFormatAYUV_ER                 = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
+    cudaEglColorFormatYUV444Planar_ER         = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422Planar_ER         = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV420Planar_ER         = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV444SemiPlanar_ER     = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV422SemiPlanar_ER     = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYUV420SemiPlanar_ER     = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU444Planar_ER         = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422Planar_ER         = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420Planar_ER         = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU444SemiPlanar_ER     = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422SemiPlanar_ER     = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_ER     = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatBayerRGGB               = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
+    cudaEglColorFormatBayerBGGR               = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
+    cudaEglColorFormatBayerGRBG               = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
+    cudaEglColorFormatBayerGBRG               = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
+    cudaEglColorFormatBayer10RGGB             = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10BGGR             = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10GRBG             = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer10GBRG             = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer12RGGB             = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12BGGR             = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12GRBG             = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12GBRG             = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer14RGGB             = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14BGGR             = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14GRBG             = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer14GBRG             = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
+    cudaEglColorFormatBayer20RGGB             = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20BGGR             = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20GRBG             = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatBayer20GBRG             = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
+    cudaEglColorFormatYVU444Planar            = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU422Planar            = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatYVU420Planar            = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatBayerIspRGGB            = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspBGGR            = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspGRBG            = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerIspGBRG            = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
+    cudaEglColorFormatBayerBCCR               = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
+    cudaEglColorFormatBayerRCCB               = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
+    cudaEglColorFormatBayerCRBC               = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
+    cudaEglColorFormatBayerCBRC               = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
+    cudaEglColorFormatBayer10CCCC             = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
+    cudaEglColorFormatBayer12BCCR             = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12RCCB             = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CRBC             = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CBRC             = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatBayer12CCCC             = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
+    cudaEglColorFormatY                       = 82, /**< Color format for single Y plane. */
+    cudaEglColorFormatYUV420SemiPlanar_2020   = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_2020   = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420Planar_2020       = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420Planar_2020       = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420SemiPlanar_709    = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420SemiPlanar_709    = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYUV420Planar_709        = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatYVU420Planar_709        = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_709  = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar      = 94, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatY10V10U10_422SemiPlanar_709  = 95, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
+    cudaEglColorFormatY_ER                         = 96, /**< Extended Range Color format for single Y plane. */
+    cudaEglColorFormatY_709_ER                     = 97, /**< Extended Range Color format for single Y plane. */
+    cudaEglColorFormatY10_ER                       = 98, /**< Extended Range Color format for single Y10 plane. */
+    cudaEglColorFormatY10_709_ER                   = 99, /**< Extended Range Color format for single Y10 plane. */
+    cudaEglColorFormatY12_ER                       = 100, /**< Extended Range Color format for single Y12 plane. */
+    cudaEglColorFormatY12_709_ER                   = 101, /**< Extended Range Color format for single Y12 plane. */
+    cudaEglColorFormatYUVA                         = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
+    cudaEglColorFormatYVYU                         = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
+    cudaEglColorFormatVYUY                         = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_ER     = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY10V10U10_444SemiPlanar_ER     = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar_ER     = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar_ER     = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
+    cudaEglColorFormatUYVY709                        = 114, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatUYVY709_ER                     = 115, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
+    cudaEglColorFormatUYVY2020                       = 116,  /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
+} cudaEglColorFormat;
+/**
+ * CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame
+ */
+typedef struct cudaEglPlaneDesc_st {
+    unsigned int width;                         /**< Width of plane */
+    unsigned int height;                        /**< Height of plane */
+    unsigned int depth;                         /**< Depth of plane */
+    unsigned int pitch;                         /**< Pitch of plane */
+    unsigned int numChannels;                   /**< Number of channels for the plane */
+    struct cudaChannelFormatDesc channelDesc;   /**< Channel Format Descriptor */
+    unsigned int reserved[4];                   /**< Reserved for future use */
+} cudaEglPlaneDesc;
+/**
+ * CUDA EGLFrame Descriptor - structure defining one frame of EGL.
+ *
+ * Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
+ * Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as:
+ * \code
+ * typedef struct cudaEglPlaneDesc_st {
+ *     unsigned int width;
+ *     unsigned int height;
+ *     unsigned int depth;
+ *     unsigned int pitch;
+ *     unsigned int numChannels;
+ *     struct cudaChannelFormatDesc channelDesc;
+ *     unsigned int reserved[4];
+ * } cudaEglPlaneDesc;
+ * \endcode
+*/
+typedef struct cudaEglFrame_st {
+   union {
+       cudaArray_t            pArray[CUDA_EGL_MAX_PLANES];     /**< Array of CUDA arrays corresponding to each plane */
+       struct cudaPitchedPtr  pPitch[CUDA_EGL_MAX_PLANES];     /**< Array of pitched pointers corresponding to each plane */
+   } frame;
+   cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];     /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc */
+   unsigned int planeCount;                             /**< Number of planes */
+   cudaEglFrameType frameType;                          /**< Frame type - array or pitch */
+   cudaEglColorFormat eglColorFormat;                   /**< CUDA EGL Color Format */
+} cudaEglFrame;
+/**
+ * CUDA EGLStream Connection
+ */
+typedef struct  CUeglStreamConnection_st *cudaEglStreamConnection;
+/** @} */ /* END CUDART_TYPES */
+/**
+ * \addtogroup CUDART_EGL EGL Interoperability
+ * This section describes the EGL interoperability functions of the CUDA
+ * runtime application programming interface.
+ *
+ * @{
+ */
+/**
+ * \brief Registers an EGL image
+ *
+ * Registers the EGLImageKHR specified by \p image for access by
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
+ * Additional Mapping/Unmapping is not required for the registered resource and
+ * ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
+ *
+ * The application will be responsible for synchronizing access to shared objects.
+ * The application must ensure that any pending operations which access the objects have completed
+ * before passing control to CUDA. This may be accomplished by issuing and waiting for
+ * glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
+ * The application will also be responsible for ensuring that any pending operation on the
+ * registered CUDA resource has completed prior to executing subsequent commands in other APIs
+ * accessing the same memory objects.
+ * This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
+ *
+ * The surface's intended usage is specified using \p flags, as follows:
+ *
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
+ *   resource will be used. It is therefore assumed that this resource will be
+ *   read from and written to by CUDA. This is the default value.
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
+ *   will not write to this resource.
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
+ *   CUDA will not read from this resource and will write over the
+ *   entire contents of the resource, so none of the data previously
+ *   stored in the resource will be preserved.
+ *
+ * The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
+ * typedef void* EGLImageKHR
+ *
+ * \param pCudaResource   - Pointer to the returned object handle
+ * \param image           - An EGLImageKHR image which can be used to create target resource.
+ * \param flags           - Map flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidResourceHandle,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaGraphicsUnregisterResource,
+ * ::cudaGraphicsResourceGetMappedEglFrame,
+ * ::cuGraphicsEGLRegisterImage
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags);
+/**
+ * \brief Connect CUDA to EGLStream as a consumer.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param eglStream         - EGLStreamKHR handle
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerConnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream);
+/**
+ * \brief Connect CUDA to EGLStream as a consumer with given flags.
+ *
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream with specified \p flags defined by
+ * ::cudaEglResourceLocationFlags.
+ *
+ * The flags specify whether the consumer wants to access frames from system memory or video memory.
+ * Default is ::cudaEglResourceLocationVidmem.
+ *
+ * \param conn              - Pointer to the returned connection handle
+ * \param eglStream         - EGLStreamKHR handle
+ * \param flags             - Flags denote intended location - system or video.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerConnectWithFlags
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags);
+/**
+ * \brief Disconnect CUDA as a consumer to EGLStream.
+ *
+ * Disconnect CUDA as a consumer to EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerDisconnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn);
+/**
+ * \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
+ *
+ * Acquire an image frame from EGLStreamKHR.
+ * ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
+ * ::cudaEglFrame.
+ *
+ * \param conn            - Connection on which to acquire
+ * \param pCudaResource   - CUDA resource on which the EGLStream frame will be mapped for use.
+ * \param pStream         - CUDA stream for synchronization and any data migrations
+ * implied by ::cudaEglResourceLocationFlags.
+ * \param timeout         - Desired timeout in usec.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * ::cudaErrorLaunchTimeout
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerReleaseFrame,
+ * ::cuEGLStreamConsumerAcquireFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn,
+        cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout);
+/**
+ * \brief Releases the last frame acquired from the EGLStream.
+ *
+ * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
+ *
+ * \param conn            - Connection on which to release
+ * \param pCudaResource   - CUDA resource whose corresponding frame is to be released
+ * \param pStream         - CUDA stream on which release will be done.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamConsumerConnect,
+ * ::cudaEGLStreamConsumerDisconnect,
+ * ::cudaEGLStreamConsumerAcquireFrame,
+ * ::cuEGLStreamConsumerReleaseFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn,
+                                                  cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream);
+/**
+ * \brief Connect CUDA to EGLStream as a producer.
+ *
+ * Connect CUDA as a producer to EGLStreamKHR specified by \p eglStream.
+ *
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
+ * API to another.
+ *
+ * \param conn   - Pointer to the returned connection handle
+ * \param eglStream - EGLStreamKHR handle
+ * \param width  - width of the image to be submitted to the stream
+ * \param height - height of the image to be submitted to the stream
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerConnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn,
+                                                EGLStreamKHR eglStream, EGLint width, EGLint height);
+/**
+ * \brief Disconnect CUDA as a producer to EGLStream.
+ *
+ * Disconnect CUDA as a producer to EGLStreamKHR.
+ *
+ * \param conn            - Connection to disconnect.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerDisconnect
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn);
+/**
+ * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
+ *
+ * The ::cudaEglFrame is defined as:
+ * \code
+ * typedef struct cudaEglFrame_st {
+ *     union {
+ *         cudaArray_t            pArray[CUDA_EGL_MAX_PLANES];
+ *         struct cudaPitchedPtr  pPitch[CUDA_EGL_MAX_PLANES];
+ *     } frame;
+ *     cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
+ *     unsigned int planeCount;
+ *     cudaEglFrameType frameType;
+ *     cudaEglColorFormat eglColorFormat;
+ * } cudaEglFrame;
+ * \endcode
+ *
+ * For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present sub-region of a memory
+ * allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in
+ * the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region.
+ *
+ * \param conn            - Connection on which to present the CUDA array
+ * \param eglframe        - CUDA EGLStream Producer Frame handle to be sent to the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to present the frame.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerReturnFrame,
+ * ::cuEGLStreamProducerPresentFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn,
+                                                 cudaEglFrame eglframe, cudaStream_t *pStream);
+/**
+ * \brief Return the CUDA eglFrame to the EGLStream last released by the consumer.
+ *
+ * This API can potentially return cudaErrorLaunchTimeout if the consumer has not
+ * returned a frame to EGL stream. If timeout is returned the application can retry.
+ *
+ * \param conn            - Connection on which to present the CUDA array
+ * \param eglframe        - CUDA EGLStream Producer Frame handle returned from the consumer over EGLStream.
+ * \param pStream         - CUDA stream on which to return the frame.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \sa
+ * ::cudaEGLStreamProducerConnect,
+ * ::cudaEGLStreamProducerDisconnect,
+ * ::cudaEGLStreamProducerPresentFrame,
+ * ::cuEGLStreamProducerReturnFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn,
+                                                cudaEglFrame *eglframe, cudaStream_t *pStream);
+/**
+ * \brief Get an eglFrame through which to access a registered EGL graphics resource.
+ *
+ * Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
+ * \p resource may be accessed.
+ * This API can only be called for EGL graphics resources.
+ *
+ * The ::cudaEglFrame is defined as
+ * \code
+ * typedef struct cudaEglFrame_st {
+ *     union {
+ *         cudaArray_t             pArray[CUDA_EGL_MAX_PLANES];
+ *         struct cudaPitchedPtr   pPitch[CUDA_EGL_MAX_PLANES];
+ *     } frame;
+ *     cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
+ *     unsigned int planeCount;
+ *     cudaEglFrameType frameType;
+ *     cudaEglColorFormat eglColorFormat;
+ * } cudaEglFrame;
+ * \endcode
+ *
+ *
+ * \param eglFrame   - Returned eglFrame.
+ * \param resource   - Registered resource to access.
+ * \param index      - Index for cubemap surfaces.
+ * \param mipLevel   - Mipmap level for the subresource to access.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown
+ *
+ * \note Note that in case of multiplanar \p *eglFrame, pitch of only first plane (unsigned int cudaEglPlaneDesc::pitch) is to be considered by the application.
+ *
+ * \sa
+ * ::cudaGraphicsSubResourceGetMappedArray,
+ * ::cudaGraphicsResourceGetMappedPointer,
+ * ::cuGraphicsResourceGetMappedEglFrame
+ */
+extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
+                                        cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel);
+/**
+ * \brief Creates an event from EGLSync object
+ *
+ * Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
+ * via \p flags. Valid flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that the created event should use blocking
+ * synchronization.  A CPU thread that uses ::cudaEventSynchronize() to wait on
+ * an event created with this flag will block until the event has actually
+ * been completed.
+ *
+ * ::cudaEventRecord and TimingData are not supported for events created from EGLSync.
+ *
+ * The EGLSyncKHR is an opaque handle to an EGL sync object.
+ * typedef void* EGLSyncKHR
+ *
+ * \param phEvent - Returns newly created event
+ * \param eglSync - Opaque handle to EGLSync object
+ * \param flags   - Event creation flags
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInitializationError,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ *
+ * \sa
+ * ::cudaEventQuery,
+ * ::cudaEventSynchronize,
+ * ::cudaEventDestroy
+ */
+extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags);
+/** @} */ /* END CUDART_EGL */
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+#endif /* __CUDA_EGL_INTEROP_H__ */

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp16.h ADDED Viewed

The diff for this file is too large to render. See raw diff

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.h ADDED Viewed

	@@ -0,0 +1,475 @@

+/*
+ * Copyright 2022-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#ifndef __CUDA_FP8_H__
+#define __CUDA_FP8_H__
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_FP8_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_FP8__ __host__ __device__
+#define __CUDA_HOSTDEVICE_FP8_DECL__ static __host__ __device__ __inline__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_FP8_DECL__ static __attribute__((unused))
+#else
+#define __CUDA_HOSTDEVICE_FP8_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE_FP8__
+#endif /* defined(__CUDACC__) */
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#define __CPP_VERSION_AT_LEAST_11_FP8
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#define __CPP_VERSION_AT_LEAST_11_FP8
+#endif
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+/* bring in enum cudaRoundMode */
+#include "device_types.h"
+#endif  /* !defined(__CUDACC_RTC__) */
+/* bring in __half_raw data type */
+#include "cuda_fp16.h"
+/* bring in __nv_bfloat16_raw data type */
+#include "cuda_bf16.h"
+// implicitly provided by NVRTC
+#if !defined(__CUDACC_RTC__)
+/* bring in float2, double4, etc vector types */
+#include "vector_types.h"
+#endif  /* !defined(__CUDACC_RTC__) */
+/**
+ * \defgroup CUDA_MATH_INTRINSIC_FP8 FP8 Intrinsics
+ * This section describes fp8 intrinsic functions.
+ * To use these functions, include the header file \p cuda_fp8.h in your
+ * program.
+ * The following macros are available to help users selectively enable/disable
+ * various definitions present in the header file:
+ * - \p __CUDA_NO_FP8_CONVERSIONS__ - If defined, this macro will prevent any
+ * use of the C++ type conversions (converting constructors and conversion
+ * operators) defined in the header.
+ * - \p __CUDA_NO_FP8_CONVERSION_OPERATORS__ - If defined, this macro will
+ * prevent any use of the  C++ conversion operators from \p fp8 to other types.
+ */
+/**
+ * \defgroup CUDA_MATH_FP8_MISC FP8 Conversion and Data Movement
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
+ * To use these functions, include the header file \p cuda_fp8.h in your
+ * program.
+ */
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief 8-bit \p unsigned \p integer
+ * type abstraction used for \p fp8 floating-point
+ * numbers storage.
+ */
+typedef unsigned char __nv_fp8_storage_t;
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief 16-bit \p unsigned \p integer
+ * type abstraction used for storage of pairs of
+ * \p fp8 floating-point numbers.
+ */
+typedef unsigned short int __nv_fp8x2_storage_t;
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief 32-bit \p unsigned \p integer
+ * type abstraction used for storage of tetrads of
+ * \p fp8 floating-point numbers.
+ */
+typedef unsigned int __nv_fp8x4_storage_t;
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Enumerates the modes applicable when
+ * performing a narrowing conversion to \p fp8 destination types.
+ */
+typedef enum __nv_saturation_t {
+    /**
+     * Means no saturation to finite is performed when conversion
+     * results in rounding values outside the range of the destination
+     * type.
+     * NOTE: for fp8 type of e4m3 kind, the results that are larger
+     * than the maximum representable finite number of the target
+     * format become NaN.
+     */
+    __NV_NOSAT,
+    /**
+     * Means inputs larger than the maximum representable
+     * finite number MAXNORM of the target format round to the
+     * MAXNORM of the same sign as the input.
+     */
+    __NV_SATFINITE,
+} __nv_saturation_t;
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Enumerates the possible
+ * interpretations of the 8-bit values when referring to them as
+ * \p fp8 types.
+ */
+typedef enum __nv_fp8_interpretation_t {
+    __NV_E4M3, /**< Stands for \p fp8 numbers of \p e4m3 kind. */
+    __NV_E5M2, /**< Stands for \p fp8 numbers of \p e5m2 kind. */
+} __nv_fp8_interpretation_t;
+/* Forward-declaration of C-style APIs */
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p double precision \p x to \p fp8 type of the
+ * requested kind using round-to-nearest-even rounding and requested saturation
+ * mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
+                       const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p double precision numbers packed
+ * in \p double2 \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
+                          const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p single precision \p x to \p fp8 type of the
+ * requested kind using round-to-nearest-even rounding and requested saturation
+ * mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
+                      const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p single precision numbers packed
+ * in \p float2 \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
+                         const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p half precision \p x to \p fp8 type of the requested
+ * kind using round-to-nearest-even rounding and requested saturation mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
+__nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
+                        const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p half precision numbers packed
+ * in \p __half2_raw \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
+    const __half2_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p nv_bfloat16 precision \p x to \p fp8 type of the
+ * requested kind using round-to-nearest-even rounding and requested saturation
+ * mode.
+ *
+ * \details Converts input \p x to \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter,
+ * using round-to-nearest-even rounding and
+ * saturation mode specified by \p saturate parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
+    const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
+ * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp8 type of
+ * the requested kind using round-to-nearest-even rounding and requested
+ * saturation mode.
+ *
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
+ * kind specified by \p fp8_interpretation parameter, using
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
+ * parameter.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
+__nv_cvt_bfloat16raw2_to_fp8x2(
+    const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
+    const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p fp8 \p x of the specified kind
+ * to \p half precision.
+ *
+ * \details Converts input \p x of \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter
+ * to \p half precision.
+ *
+ * \returns
+ * - The \p __half_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
+__nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
+                        const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input vector of two \p fp8 values of the specified kind
+ * to a vector of two \p half precision values packed in \p __half2_raw
+ * structure.
+ *
+ * \details Converts input vector \p x of \p fp8 type of the kind specified by
+ * \p fp8_interpretation parameter
+ * to a vector of two \p half precision values and returns as \p __half2_raw
+ * structure.
+ *
+ * \returns
+ * - The \p __half2_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
+__nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
+                           const __nv_fp8_interpretation_t fp8_interpretation);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p bfloat16 value into a scaling factor of \p e8m0 kind.
+ *
+ * \details Input number's absolute value is rounded to the closest power of two in the
+ * direction specified via \p rounding parameter. Rounded results that are
+ * smaller than the smallest representable target format number 2^-127 are then
+ * clipped to 2^-127. Results that are larger than the largest representable
+ * target format number 2^127 are either clipped to 2^127 if \p saturate equals
+ * to \p __NV_SATFINITE, or convert to \p NaN otherwise. \p NaN inputs convert
+ * into \p NaN output, encoded as \p 0xFF in the target format.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_e8m0(const __nv_bfloat16_raw x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts a pair of \p bfloat16 values into a pair of scaling factors of \p e8m0 kind.
+ *
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_bfloat162raw_to_e8m0x2(const __nv_bfloat162_raw x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p float value into a scaling factor of \p e8m0 kind.
+ *
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_float_to_e8m0(const float x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts a pair of \p float values into a pair of scaling factors of \p e8m0 kind.
+ *
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_float2_to_e8m0x2(const float2 x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input \p double value into a scaling factor of \p e8m0 kind.
+ *
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
+ *
+ * \returns
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_double_to_e8m0(const double x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts a pair of \p double values into a pair of scaling factors of \p e8m0 kind.
+ *
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
+ *
+ * \returns
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_double2_to_e8m0x2(const double2 x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input scaling factor value of \p e8m0 kind into \p bfloat16.
+ *
+ * \details Input scales are exact powers of two or a \p NaN value,
+ * also representable in the target format.
+ *
+ * \returns
+ * - The \p __nv_bfloat16_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw __nv_cvt_e8m0_to_bf16raw(const __nv_fp8_storage_t x);
+/**
+ * \ingroup CUDA_MATH_FP8_MISC
+ * \brief Converts input pair of scaling factors of \p e8m0 kind into a pair of \p bfloat16 values.
+ *
+ * \returns
+ * - The \p __nv_bfloat162_raw value holds the result of conversion.
+ */
+__CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat162_raw __nv_cvt_e8m0x2_to_bf162raw(const __nv_fp8x2_storage_t x);
+#if defined(__cplusplus)
+#define __CUDA_FP8_TYPES_EXIST__
+/* Forward-declaration of structures defined in "cuda_fp8.hpp" */
+struct __nv_fp8_e5m2;
+struct __nv_fp8x2_e5m2;
+struct __nv_fp8x4_e5m2;
+struct __nv_fp8_e4m3;
+struct __nv_fp8x2_e4m3;
+struct __nv_fp8x4_e4m3;
+struct __nv_fp8_e8m0;
+struct __nv_fp8x2_e8m0;
+struct __nv_fp8x4_e8m0;
+#endif /* defined(__cplusplus) */
+#include "cuda_fp8.hpp"
+#undef __CUDA_FP8_DECL__
+#undef __CUDA_HOSTDEVICE_FP8__
+#undef __CUDA_HOSTDEVICE_FP8_DECL__
+#if defined(__CPP_VERSION_AT_LEAST_11_FP8)
+#undef __CPP_VERSION_AT_LEAST_11_FP8
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
+#endif /* end of include guard: __CUDA_FP8_H__ */

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_occupancy.h ADDED Viewed

	@@ -0,0 +1,2094 @@

+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+/**
+ * CUDA Occupancy Calculator
+ *
+ * NAME
+ *
+ *   cudaOccMaxActiveBlocksPerMultiprocessor,
+ *   cudaOccMaxPotentialOccupancyBlockSize,
+ *   cudaOccMaxPotentialOccupancyBlockSizeVariableSMem,
+ *   cudaOccAvailableDynamicSMemPerBlock
+ *
+ * DESCRIPTION
+ *
+ *   The CUDA occupancy calculator provides a standalone, programmatic
+ *   interface to compute the occupancy of a function on a device. It can also
+ *   provide occupancy-oriented launch configuration suggestions.
+ *
+ *   The function and device are defined by the user through
+ *   cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
+ *   structures. All APIs require all 3 of them.
+ *
+ *   See the structure definition for more details about the device / function
+ *   descriptors.
+ *
+ *   See each API's prototype for API usage.
+ *
+ * COMPATIBILITY
+ *
+ *   The occupancy calculator will be updated on each major CUDA toolkit
+ *   release. It does not provide forward compatibility, i.e. new hardware
+ *   released after this implementation's release will not be supported.
+ *
+ * NOTE
+ *
+ *   If there is access to CUDA runtime, and the sole intent is to calculate
+ *   occupancy related values on one of the accessible CUDA devices, using CUDA
+ *   runtime's occupancy calculation APIs is recommended.
+ *
+ */
+#ifndef __cuda_occupancy_h__
+#define __cuda_occupancy_h__
+#include <stddef.h>
+#include <limits.h>
+#include <string.h>
+// __OCC_INLINE will be undefined at the end of this header
+//
+#ifdef __CUDACC__
+#define __OCC_INLINE inline __host__ __device__
+#elif defined _MSC_VER
+#define __OCC_INLINE __inline
+#else // GNUCC assumed
+#define __OCC_INLINE inline
+#endif
+/**
+ * Status codes returned by the occupancy calculator APIs.
+ */
+enum cudaOccError_enum {
+    CUDA_OCC_SUCCESS              = 0,  // no error encountered
+    CUDA_OCC_ERROR_INVALID_INPUT  = 1,  // input parameter is invalid
+    CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2,  // requested device is not supported in
+                                        // current implementation or device is
+                                        // invalid
+};
+typedef enum cudaOccError_enum       cudaOccError;
+typedef struct cudaOccResult         cudaOccResult;
+typedef struct cudaOccDeviceProp     cudaOccDeviceProp;
+typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
+typedef struct cudaOccDeviceState    cudaOccDeviceState;
+/**
+ * The CUDA occupancy calculator computes the occupancy of the function
+ * described by attributes with the given block size (blockSize), static device
+ * properties (properties), dynamic device state (state) and per-block dynamic
+ * shared memory allocation (dynamicSmemSize) in bytes, and outputs it through
+ * result along with other useful information. The occupancy is computed in
+ * terms of the maximum number of active blocks per multiprocessor. The user can
+ * then convert it to other metrics, such as number of active warps.
+ *
+ * RETURN VALUE
+ *
+ * The occupancy and related information is returned through result.
+ *
+ * If result->activeBlocksPerMultiprocessor is 0, then the given parameter
+ * combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ */
+static __OCC_INLINE
+cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
+    cudaOccResult               *result,           // out
+    const cudaOccDeviceProp     *properties,       // in
+    const cudaOccFuncAttributes *attributes,       // in
+    const cudaOccDeviceState    *state,            // in
+    int                          blockSize,        // in
+    size_t                       dynamicSmemSize); // in
+/**
+ * The CUDA launch configurator C API suggests a grid / block size pair (in
+ * minGridSize and blockSize) that achieves the best potential occupancy
+ * (i.e. maximum number of active warps with the smallest number of blocks) for
+ * the given function described by attributes, on a device described by
+ * properties with settings in state.
+ *
+ * If per-block dynamic shared memory allocation is not needed, the user should
+ * leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0.
+ *
+ * If per-block dynamic shared memory allocation is needed, then if the dynamic
+ * shared memory size is constant regardless of block size, the size should be
+ * passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
+ * NULL.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with different
+ * block sizes, the user needs to provide a pointer to a unary function through
+ * blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
+ * a block of the function for any given block size. dynamicSMemSize is
+ * ignored. An example signature is:
+ *
+ *    // Take block size, returns dynamic shared memory needed
+ *    size_t blockToSmem(int blockSize);
+ *
+ * RETURN VALUE
+ *
+ * The suggested block size and the minimum number of blocks needed to achieve
+ * the maximum occupancy are returned through blockSize and minGridSize.
+ *
+ * If *blockSize is 0, then the given combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+static __OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,      // out
+    int                         *blockSize,        // out
+    const cudaOccDeviceProp     *properties,       // in
+    const cudaOccFuncAttributes *attributes,       // in
+    const cudaOccDeviceState    *state,            // in
+    size_t                     (*blockSizeToDynamicSMemSize)(int), // in
+    size_t                       dynamicSMemSize); // in
+/**
+ * The CUDA launch configurator C++ API suggests a grid / block size pair (in
+ * minGridSize and blockSize) that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number of blocks)
+ * for the given function described by attributes, on a device described by
+ * properties with settings in state.
+ *
+ * If per-block dynamic shared memory allocation is 0 or constant regardless of
+ * block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
+ * configure the launch. A constant dynamic shared memory allocation size in
+ * bytes can be passed through dynamicSMemSize.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with different
+ * block sizes, the user needs to use
+ * cudaOccMaxPotentialOccupancyBlockSizeVariableSMem instead, and provide a
+ * functor / pointer to a unary function (blockSizeToDynamicSMemSize) that
+ * computes the dynamic shared memory needed by func for any given block
+ * size. An example signature is:
+ *
+ *  // Take block size, returns per-block dynamic shared memory needed
+ *  size_t blockToSmem(int blockSize);
+ *
+ * RETURN VALUE
+ *
+ * The suggested block size and the minimum number of blocks needed to achieve
+ * the maximum occupancy are returned through blockSize and minGridSize.
+ *
+ * If *blockSize is 0, then the given combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+#if defined(__cplusplus)
+namespace {
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,          // out
+    int                         *blockSize,            // out
+    const cudaOccDeviceProp     *properties,           // in
+    const cudaOccFuncAttributes *attributes,           // in
+    const cudaOccDeviceState    *state,                // in
+    size_t                       dynamicSMemSize = 0); // in
+template <typename UnaryFunction>
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
+    int                         *minGridSize,          // out
+    int                         *blockSize,            // out
+    const cudaOccDeviceProp     *properties,           // in
+    const cudaOccFuncAttributes *attributes,           // in
+    const cudaOccDeviceState    *state,                // in
+    UnaryFunction                blockSizeToDynamicSMemSize); // in
+} // namespace anonymous
+#endif // defined(__cplusplus)
+/**
+ *
+ * The CUDA dynamic shared memory calculator computes the maximum size of
+ * per-block dynamic shared memory if we want to place numBlocks blocks
+ * on an SM.
+ *
+ * RETURN VALUE
+ *
+ * Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow
+ * numBlocks blocks per SM.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+static __OCC_INLINE
+cudaOccError cudaOccAvailableDynamicSMemPerBlock(
+    size_t                      *dynamicSmemSize,  // out
+    const cudaOccDeviceProp     *properties,       // in
+    const cudaOccFuncAttributes *attributes,       // in
+    const cudaOccDeviceState    *state,            // in
+    int                         numBlocks,         // in
+    int                         blockSize);        // in
+/**
+ * Data structures
+ *
+ * These structures are subject to change for future architecture and CUDA
+ * releases. C users should initialize the structure as {0}.
+ *
+ */
+/**
+ * Device descriptor
+ *
+ * This structure describes a device.
+ */
+struct cudaOccDeviceProp {
+    int    computeMajor;                // Compute capability major version
+    int    computeMinor;                // Compute capability minor
+                                        // version. Unsupported minor versions
+                                        // may cause an error
+    int    maxThreadsPerBlock;          // Maximum number of threads per block
+    int    maxThreadsPerMultiprocessor; // Maximum number of threads per SM
+                                        // i.e. (Max. number of warps) x (warp
+                                        // size)
+    int    regsPerBlock;                // Maximum number of registers per block
+    int    regsPerMultiprocessor;       // Maximum number of registers per SM
+    int    warpSize;                    // Warp size
+    size_t sharedMemPerBlock;           // Maximum shared memory size per block
+    size_t sharedMemPerMultiprocessor;  // Maximum shared memory size per SM
+    int    numSms;                      // Number of SMs available
+    size_t sharedMemPerBlockOptin;      // Maximum optin shared memory size per block
+    size_t reservedSharedMemPerBlock;   // Shared memory per block reserved by driver
+#ifdef __cplusplus
+    // This structure can be converted from a cudaDeviceProp structure for users
+    // that use this header in their CUDA applications.
+    //
+    // If the application has access to the CUDA Runtime API, the application
+    // can obtain the device properties of a CUDA device through
+    // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
+    // cudaDeviceProp structure.
+    //
+    // Example:
+    /*
+     {
+         cudaDeviceProp prop;
+         cudaGetDeviceProperties(&prop, ...);
+         cudaOccDeviceProp occProp = prop;
+         ...
+         cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
+     }
+     */
+    //
+    // Converting constructor (intentionally not explicit, so the copy
+    // initialization in the example above compiles). DeviceProp may be any
+    // type exposing the members read in the initializer list below; note the
+    // source spellings major, minor, maxThreadsPerMultiProcessor and
+    // multiProcessorCount differ from the destination field names.
+    template<typename DeviceProp>
+    __OCC_INLINE
+    cudaOccDeviceProp(const DeviceProp &props)
+    :   computeMajor                (props.major),
+        computeMinor                (props.minor),
+        maxThreadsPerBlock          (props.maxThreadsPerBlock),
+        maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
+        regsPerBlock                (props.regsPerBlock),
+        regsPerMultiprocessor       (props.regsPerMultiprocessor),
+        warpSize                    (props.warpSize),
+        sharedMemPerBlock           (props.sharedMemPerBlock),
+        sharedMemPerMultiprocessor  (props.sharedMemPerMultiprocessor),
+        numSms                      (props.multiProcessorCount),
+        sharedMemPerBlockOptin      (props.sharedMemPerBlockOptin),
+        reservedSharedMemPerBlock   (props.reservedSharedMemPerBlock)
+    {}
+    // Default constructor: every field is zero-initialized.
+    __OCC_INLINE
+    cudaOccDeviceProp()
+    :   computeMajor                (0),
+        computeMinor                (0),
+        maxThreadsPerBlock          (0),
+        maxThreadsPerMultiprocessor (0),
+        regsPerBlock                (0),
+        regsPerMultiprocessor       (0),
+        warpSize                    (0),
+        sharedMemPerBlock           (0),
+        sharedMemPerMultiprocessor  (0),
+        numSms                      (0),
+        sharedMemPerBlockOptin      (0),
+        reservedSharedMemPerBlock   (0)
+    {}
+#endif // __cplusplus
+};
+/**
+ * Partitioned global caching option
+ *
+ * Used as the requested setting in cudaOccFuncAttributes::partitionedGCConfig
+ * and as the reported setting in cudaOccResult::partitionedGCConfig.
+ */
+typedef enum cudaOccPartitionedGCConfig_enum {
+    PARTITIONED_GC_OFF,        // Disable partitioned global caching
+    PARTITIONED_GC_ON,         // Prefer partitioned global caching
+    PARTITIONED_GC_ON_STRICT   // Force partitioned global caching
+} cudaOccPartitionedGCConfig;
+/**
+ * Per function opt in maximum dynamic shared memory limit
+ *
+ * Stored in cudaOccFuncAttributes::shmemLimitConfig.
+ */
+typedef enum cudaOccFuncShmemConfig_enum {
+    FUNC_SHMEM_LIMIT_DEFAULT,   // Default shmem limit
+    FUNC_SHMEM_LIMIT_OPTIN,     // Use the optin shmem limit
+} cudaOccFuncShmemConfig;
+/**
+ * Function descriptor
+ *
+ * This structure describes a CUDA function.
+ */
+struct cudaOccFuncAttributes {
+    int maxThreadsPerBlock; // Maximum block size the function can work with. If
+                            // unlimited, use INT_MAX or any value greater than
+                            // or equal to maxThreadsPerBlock of the device
+    int numRegs;            // Number of registers used. When the function is
+                            // launched on device, the register count may change
+                            // due to internal tools requirements.
+    size_t sharedSizeBytes; // Size in bytes of static shared memory used
+    cudaOccPartitionedGCConfig partitionedGCConfig;
+                            // Partitioned global caching is required to enable
+                            // caching on certain chips, such as sm_52
+                            // devices. Partitioned global caching can be
+                            // automatically disabled if the occupancy
+                            // requirement of the launch cannot support caching.
+                            //
+                            // To override this behavior with caching on and
+                            // calculate occupancy strictly according to the
+                            // preference, set partitionedGCConfig to
+                            // PARTITIONED_GC_ON_STRICT. This is especially
+                            // useful for experimenting and finding launch
+                            // configurations (MaxPotentialOccupancyBlockSize)
+                            // that allow global caching to take effect.
+                            //
+                            // This flag only affects the occupancy calculation.
+    cudaOccFuncShmemConfig shmemLimitConfig;
+                            // Certain chips like sm_70 allow a user to opt into
+                            // a higher per block limit of dynamic shared memory.
+                            // This optin is performed on a per function basis
+                            // using the cuFuncSetAttribute function
+    size_t maxDynamicSharedSizeBytes;
+                            // User set limit on maximum dynamic shared memory
+                            // usable by the kernel.
+                            // This limit is set using the cuFuncSetAttribute
+                            // function.
+    int numBlockBarriers;   // Number of block barriers used (default to 1)
+                            // NOTE(review): the default constructor below
+                            // initializes this field to 0, not 1; only the
+                            // converting constructor uses 1 -- confirm which
+                            // default is intended.
+#ifdef __cplusplus
+    // This structure can be converted from a cudaFuncAttributes structure for
+    // users that use this header in their CUDA applications.
+    //
+    // If the application has access to the CUDA Runtime API, the application
+    // can obtain the function attributes of a CUDA kernel function through
+    // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
+    // cudaFuncAttributes structure.
+    //
+    // Example:
+    /*
+      __global__ void foo() {...}
+      ...
+      {
+          cudaFuncAttributes attr;
+          cudaFuncGetAttributes(&attr, foo);
+          cudaOccFuncAttributes occAttr = attr;
+          ...
+          cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
+      }
+     */
+    //
+    // Converting constructor (intentionally not explicit, so the copy
+    // initialization in the example above compiles). Copies
+    // maxThreadsPerBlock, numRegs, sharedSizeBytes and
+    // maxDynamicSharedSizeBytes from attr; the remaining fields are not read
+    // from attr and are fixed to PARTITIONED_GC_OFF, FUNC_SHMEM_LIMIT_OPTIN
+    // and one block barrier.
+    template<typename FuncAttributes>
+    __OCC_INLINE
+    cudaOccFuncAttributes(const FuncAttributes &attr)
+    :   maxThreadsPerBlock  (attr.maxThreadsPerBlock),
+        numRegs             (attr.numRegs),
+        sharedSizeBytes     (attr.sharedSizeBytes),
+        partitionedGCConfig (PARTITIONED_GC_OFF),
+        shmemLimitConfig    (FUNC_SHMEM_LIMIT_OPTIN),
+        maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
+        numBlockBarriers    (1)
+    {}
+    // Default constructor: numeric fields are zeroed, partitioned global
+    // caching is off and the default shared memory limit is selected.
+    __OCC_INLINE
+    cudaOccFuncAttributes()
+    :   maxThreadsPerBlock  (0),
+        numRegs             (0),
+        sharedSizeBytes     (0),
+        partitionedGCConfig (PARTITIONED_GC_OFF),
+        shmemLimitConfig    (FUNC_SHMEM_LIMIT_DEFAULT),
+        maxDynamicSharedSizeBytes (0),
+        numBlockBarriers    (0)
+    {}
+#endif
+};
+/**
+ * Cache / shared memory split preference
+ *
+ * Stored in cudaOccDeviceState::cacheConfig.
+ */
+typedef enum cudaOccCacheConfig_enum {
+    CACHE_PREFER_NONE   = 0x00, // no preference for shared memory or L1 (default)
+    CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache
+    CACHE_PREFER_L1     = 0x02, // prefer larger L1 cache and smaller shared memory
+    CACHE_PREFER_EQUAL  = 0x03  // prefer equal sized L1 cache and shared memory
+} cudaOccCacheConfig;
+/**
+ * Shared memory / L1 split preference
+ *
+ * Named values for cudaOccDeviceState::carveoutConfig, which is declared as a
+ * plain int rather than as this enum type.
+ * NOTE(review): the values 0 / 50 / 100 look like a percentage of the maximum
+ * available shared memory, so intermediate integers are presumably accepted
+ * too -- confirm against the code that consumes carveoutConfig.
+ */
+typedef enum cudaOccCarveoutConfig_enum {
+    SHAREDMEM_CARVEOUT_DEFAULT       = -1,  // no preference for shared memory or L1 (default)
+    SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, // prefer maximum available shared memory, minimum L1 cache
+    SHAREDMEM_CARVEOUT_MAX_L1        = 0,    // prefer maximum available L1 cache, minimum shared memory
+    SHAREDMEM_CARVEOUT_HALF          = 50   // prefer half of maximum available shared memory, with the rest as L1 cache
+} cudaOccCarveoutConfig;
+/**
+ * Device state descriptor
+ *
+ * This structure describes device settings that affect occupancy calculation.
+ */
+struct cudaOccDeviceState
+{
+    // Cache / shared memory split preference. Deprecated on Volta
+    cudaOccCacheConfig cacheConfig;
+    // Shared memory / L1 split preference. Supported only on Volta
+    int carveoutConfig;
+#ifdef __cplusplus
+    // Default constructor: no cache preference and the default carveout.
+    __OCC_INLINE
+    cudaOccDeviceState()
+    :   cacheConfig     (CACHE_PREFER_NONE),
+        carveoutConfig  (SHAREDMEM_CARVEOUT_DEFAULT)
+    {}
+#endif
+};
+/**
+ * Occupancy limiting factors
+ *
+ * Each enumerator is a distinct bit, so several factors can be combined in
+ * the cudaOccResult::limitingFactors bit field.
+ */
+typedef enum cudaOccLimitingFactor_enum {
+                                    // Occupancy limited due to:
+    OCC_LIMIT_WARPS         = 0x01, // - warps available
+    OCC_LIMIT_REGISTERS     = 0x02, // - registers available
+    OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available
+    OCC_LIMIT_BLOCKS        = 0x08, // - blocks available
+    OCC_LIMIT_BARRIERS      = 0x10  // - barrier available
+} cudaOccLimitingFactor;
+/**
+ * Occupancy output
+ *
+ * This structure contains occupancy calculator's output.
+ */
+struct cudaOccResult {
+    int activeBlocksPerMultiprocessor; // Occupancy
+    unsigned int limitingFactors;      // Factors that limited occupancy. A bit
+                                       // field that records the limiting
+                                       // factors, see cudaOccLimitingFactor
+    int blockLimitRegs;                // Occupancy due to register
+                                       // usage, INT_MAX if the kernel does not
+                                       // use any register.
+    int blockLimitSharedMem;           // Occupancy due to shared memory
+                                       // usage, INT_MAX if the kernel does not
+                                       // use shared memory.
+    int blockLimitWarps;               // Occupancy due to block size limit
+    int blockLimitBlocks;              // Occupancy due to maximum number of blocks
+                                       // manageable per SM
+    int blockLimitBarriers;            // Occupancy due to block barrier usage
+    int allocatedRegistersPerBlock;    // Actual number of registers allocated per
+                                       // block
+    size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
+                                       // per block
+    cudaOccPartitionedGCConfig partitionedGCConfig;
+                                       // Report if partitioned global caching
+                                       // is actually enabled.
+};
+/**
+ * Partitioned global caching support
+ *
+ * See cudaOccPartitionedGlobalCachingModeSupport
+ */
+typedef enum cudaOccPartitionedGCSupport_enum {
+    PARTITIONED_GC_NOT_SUPPORTED,  // Partitioned global caching is not supported
+    PARTITIONED_GC_SUPPORTED,      // Partitioned global caching is supported
+} cudaOccPartitionedGCSupport;
+/**
+ * Implementation
+ */
+/**
+ * Max compute capability supported
+ */
+#define __CUDA_OCC_MAJOR__ 12
+#define __CUDA_OCC_MINOR__ 0
+//////////////////////////////////////////
+//    Mathematical Helper Functions     //
+//////////////////////////////////////////
+/**
+ * Smaller of two integers; lhs is returned when both are equal.
+ */
+static __OCC_INLINE int __occMin(int lhs, int rhs)
+{
+    if (rhs < lhs) {
+        return rhs;
+    }
+    return lhs;
+}
+/**
+ * Integer division of x by y after biasing the numerator by (y - 1), i.e.
+ * the quotient rounded up for non-negative x and positive y. y must not be 0.
+ */
+static __OCC_INLINE int __occDivideRoundUp(int x, int y)
+{
+    const int biasedNumerator = x + (y - 1);
+    return biasedNumerator / y;
+}
+/**
+ * Rounds x up to a multiple of y: y times the rounded-up quotient x / y.
+ */
+static __OCC_INLINE int __occRoundUp(int x, int y)
+{
+    const int chunkCount = __occDivideRoundUp(x, y);
+    return y * chunkCount;
+}
+//////////////////////////////////////////
+//      Architectural Properties        //
+//////////////////////////////////////////
+/**
+ * Granularity of shared memory allocation
+ *
+ * Writes 256 to *limit for compute capability major 3, 5, 6 and 7, and 128
+ * for major 8, 9, 10 and 12. Any other major version returns
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves *limit unmodified.
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
+{
+    const int major = properties->computeMajor;
+    int granularity;
+
+    if (major == 3 || (major >= 5 && major <= 7)) {
+        granularity = 256;
+    }
+    else if ((major >= 8 && major <= 10) || major == 12) {
+        granularity = 128;
+    }
+    else {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+    *limit = granularity;
+    return CUDA_OCC_SUCCESS;
+}
+/**
+ * Maximum number of registers per thread
+ *
+ * Writes 255 to *limit for compute capability major 3, 5 and 6, and 256 for
+ * major 7, 8, 9, 10 and 12. Any other major version returns
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves *limit unmodified.
+ */
+static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
+{
+    const int major = properties->computeMajor;
+    int maxRegs;
+
+    if (major == 3 || major == 5 || major == 6) {
+        maxRegs = 255;
+    }
+    else if ((major >= 7 && major <= 10) || major == 12) {
+        maxRegs = 256;
+    }
+    else {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+    *limit = maxRegs;
+    return CUDA_OCC_SUCCESS;
+}
+/**
+ * Granularity of register allocation
+ *
+ * Writes 256 to *limit for every supported compute capability major version
+ * (3, 5, 6, 7, 8, 9, 10 and 12). Any other major version returns
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves *limit unmodified.
+ */
+static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
+{
+    const int major = properties->computeMajor;
+    const int isSupported = (major == 3)
+                         || (major >= 5 && major <= 10)
+                         || (major == 12);
+
+    if (!isSupported) {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+    *limit = 256;
+    return CUDA_OCC_SUCCESS;
+}
+/**
+ * Number of sub-partitions
+ *
+ * Writes 4 to *limit for compute capability major 3, 5, 7, 8, 9, 10 and 12.
+ * For major 6 the result depends on the minor version: 2 when the minor
+ * version is 0, otherwise 4. Any other major version returns
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves *limit unmodified.
+ */
+static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
+{
+    const int major = properties->computeMajor;
+    int subPartitions;
+
+    if (major == 6) {
+        // The minor version is only consulted for major 6.
+        subPartitions = (properties->computeMinor != 0) ? 4 : 2;
+    }
+    else if (major == 3 || major == 5 || (major >= 7 && major <= 10) || major == 12) {
+        subPartitions = 4;
+    }
+    else {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+    *limit = subPartitions;
+    return CUDA_OCC_SUCCESS;
+}
+/**
+ * Maximum number of blocks that can run simultaneously on a multiprocessor
+ *
+ * Value written to *limit, by compute capability:
+ *   3.x          -> 16
+ *   5.x, 6.x     -> 32
+ *   7.5 (Turing) -> 16, other 7.x -> 32
+ *   8.0 -> 32, 8.9 -> 24, other 8.x -> 16
+ *   9.x          -> 32
+ *   10.1 -> 24, other 10.x -> 32
+ *   12.x         -> 24
+ * Any other major version returns CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves
+ * *limit unmodified.
+ */
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
+{
+    const int major = properties->computeMajor;
+    int maxBlocks;
+
+    // The minor version is read only in the branches that depend on it.
+    if (major == 3) {
+        maxBlocks = 16;
+    }
+    else if (major == 5 || major == 6 || major == 9) {
+        maxBlocks = 32;
+    }
+    else if (major == 7) {
+        const int isTuring = (properties->computeMinor == 5);
+        maxBlocks = isTuring ? 16 : 32;
+    }
+    else if (major == 8) {
+        const int minor = properties->computeMinor;
+        if (minor == 0) {
+            maxBlocks = 32;
+        }
+        else {
+            maxBlocks = (minor == 9) ? 24 : 16;
+        }
+    }
+    else if (major == 10) {
+        maxBlocks = (properties->computeMinor == 1) ? 24 : 32;
+    }
+    else if (major == 12) {
+        maxBlocks = 24;
+    }
+    else {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+    *limit = maxBlocks;
+    return CUDA_OCC_SUCCESS;
+}
+/**
+ * Round *shMemSize up to the smallest entry of configs[] that can hold it.
+ *
+ * configs[] must be sorted in ascending order and contain numConfigs entries.
+ * Returns CUDA_OCC_SUCCESS after storing the chosen entry in *shMemSize, or
+ * CUDA_OCC_ERROR_INVALID_INPUT (leaving *shMemSize untouched) when the request
+ * is larger than every entry.
+ */
+static __OCC_INLINE cudaOccError __occAlignUpToShmemConfig(size_t *shMemSize, const size_t *configs, int numConfigs)
+{
+    int i;
+    for (i = 0; i < numConfigs; ++i) {
+        if (*shMemSize <= configs[i]) {
+            *shMemSize = configs[i];
+            return CUDA_OCC_SUCCESS;
+        }
+    }
+    return CUDA_OCC_ERROR_INVALID_INPUT;
+}
+/**
+ * Align up shared memory based on compute major configurations
+ *
+ * Devices of compute capability 7.x and later only offer a fixed set of
+ * shared memory sizes per multiprocessor. This rounds *shMemSize up to the
+ * next size in the set that applies to the device:
+ *   7.5            : 32KB, 64KB
+ *   7.x (other)    : 0KB, 8KB, 16KB, 32KB, 64KB, 96KB
+ *   8.0, 8.7       : 0KB, 8KB, 16KB, 32KB, 64KB, 100KB, 132KB, 164KB
+ *   8.x (other)    : 0KB, 8KB, 16KB, 32KB, 64KB, 100KB
+ *   9.x, 10.0, 10.1: 0KB, 8KB, 16KB, 32KB, 64KB, 100KB, 132KB, 164KB, 196KB, 228KB
+ *   12.0           : 0KB, 8KB, 16KB, 32KB, 64KB, 100KB
+ *
+ * Returns CUDA_OCC_ERROR_INVALID_INPUT when *shMemSize exceeds the largest
+ * size for the device, and CUDA_OCC_ERROR_UNKNOWN_DEVICE for any other compute
+ * capability; *shMemSize is left untouched in both cases.
+ */
+static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
+{
+    // Sizes shared by every 8.x-and-later table; each architecture uses a
+    // prefix of this list, so only the number of valid entries varies.
+    const size_t commonConfigs[] = {
+        0,          8 * 1024,   16 * 1024,  32 * 1024,  64 * 1024,
+        100 * 1024, 132 * 1024, 164 * 1024, 196 * 1024, 228 * 1024
+    };
+    const size_t voltaConfigs[]  = { 0, 8 * 1024, 16 * 1024, 32 * 1024, 64 * 1024, 96 * 1024 };
+    // Turing has no 0KB option: a request of 0 rounds up to 32KB.
+    const size_t turingConfigs[] = { 32 * 1024, 64 * 1024 };
+    const size_t *configs = commonConfigs;
+    int numConfigs = 0;
+    switch (properties->computeMajor) {
+    case 7:
+        if (properties->computeMinor == 5) {
+            configs    = turingConfigs;
+            numConfigs = 2;
+        }
+        else {
+            configs    = voltaConfigs;
+            numConfigs = 6;
+        }
+        break;
+    case 8:
+        // 8.0 and 8.7 go up to 164KB, the remaining 8.x parts stop at 100KB
+        numConfigs = (properties->computeMinor == 0 || properties->computeMinor == 7) ? 8 : 6;
+        break;
+    case 9:
+        numConfigs = 10;
+        break;
+    case 10:
+        // GB10x GPUs in Blackwell family: only minors 0 and 1 are known
+        if (properties->computeMinor != 0 && properties->computeMinor != 1) {
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+        }
+        numConfigs = 10;
+        break;
+    case 12:
+        if (properties->computeMinor != 0) {
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+        }
+        numConfigs = 6;
+        break;
+    default:
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+    return __occAlignUpToShmemConfig(shMemSize, configs, numConfigs);
+}
+/**
+ * Shared memory per multiprocessor implied by the carveoutConfig API
+ * introduced with Volta.
+ *
+ * state->carveoutConfig takes precedence; when it is
+ * SHAREDMEM_CARVEOUT_DEFAULT the older state->cacheConfig setting is mapped
+ * onto a carveout value instead. The resulting size is aligned up with
+ * cudaOccAlignUpShmemSizeVoltaPlus() and stored in *limit; that call's status
+ * is returned. A carveoutConfig outside
+ * [SHAREDMEM_CARVEOUT_DEFAULT, SHAREDMEM_CARVEOUT_MAX_SHARED] yields
+ * CUDA_OCC_ERROR_INVALID_INPUT without touching *limit.
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    cudaOccError alignStatus;
+    size_t requestedBytes;
+    int carveout = state->carveoutConfig;
+    if (carveout < SHAREDMEM_CARVEOUT_DEFAULT || carveout > SHAREDMEM_CARVEOUT_MAX_SHARED) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+    // No explicit carveout requested: derive one from the legacy cache
+    // configuration. Any other cacheConfig value keeps the default.
+    if (carveout == SHAREDMEM_CARVEOUT_DEFAULT) {
+        if (state->cacheConfig == CACHE_PREFER_L1) {
+            carveout = SHAREDMEM_CARVEOUT_MAX_L1;
+        }
+        else if (state->cacheConfig == CACHE_PREFER_SHARED) {
+            carveout = SHAREDMEM_CARVEOUT_MAX_SHARED;
+        }
+        else if (state->cacheConfig == CACHE_PREFER_EQUAL) {
+            carveout = SHAREDMEM_CARVEOUT_HALF;
+        }
+    }
+    if (carveout != SHAREDMEM_CARVEOUT_DEFAULT) {
+        // carveout is used as a percentage of the per-SM shared memory
+        requestedBytes = (size_t) (carveout * properties->sharedMemPerMultiprocessor) / 100;
+    }
+    else {
+        requestedBytes = properties->sharedMemPerMultiprocessor;
+    }
+    alignStatus = cudaOccAlignUpShmemSizeVoltaPlus(&requestedBytes, properties);
+    // *limit is written even when the alignment step reports an error
+    *limit = requestedBytes;
+    return alignStatus;
+}
+/**
+ * Shared memory per multiprocessor implied by state->cacheConfig.
+ *
+ * Handles compute majors 3, 5 and 6 only; any other major returns
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves *limit untouched.
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    const size_t smemHigh = properties->sharedMemPerMultiprocessor;
+    size_t preferredBytes = 0;
+    if (properties->computeMajor == 5 || properties->computeMajor == 6) {
+        // Maxwell and Pascal have dedicated shared memory, so the cache
+        // configuration does not change the amount available.
+        preferredBytes = smemHigh;
+    }
+    else if (properties->computeMajor == 3) {
+        // Kepler shares one pool between L1 and shared memory and supports
+        // 16KB, 32KB, or 48KB partitions for L1; the rest is shared memory.
+        // smemHigh corresponds to the smallest L1 partition, smemLow to the
+        // largest one.
+        const size_t smallestL1 = 16384;
+        const size_t largestL1  = 49152;
+        const size_t poolSize   = smemHigh + smallestL1;
+        const size_t smemLow    = poolSize - largestL1;
+        if (state->cacheConfig == CACHE_PREFER_L1) {
+            preferredBytes = smemLow;
+        }
+        else if (state->cacheConfig == CACHE_PREFER_EQUAL) {
+            // Mid-point between high and low
+            preferredBytes = (smemHigh + smemLow) / 2;
+        }
+        else {
+            // CACHE_PREFER_NONE, CACHE_PREFER_SHARED and any other value
+            preferredBytes = smemHigh;
+        }
+    }
+    else {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+    *limit = preferredBytes;
+    return CUDA_OCC_SUCCESS;
+}
+/**
+ * Shared memory per multiprocessor for the configuration requested by the
+ * user.
+ *
+ * Dispatches on compute major: below 7 the cache-config based calculation is
+ * used, from 7 upwards the carveout-preference based one. The status of the
+ * selected helper is returned as-is.
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    if (properties->computeMajor < 7) {
+        return cudaOccSMemPreference(limit, properties, state);
+    }
+    // Volta and later: the shared memory carveout preference is handled
+    // separately from the cache config preference.
+    return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
+}
+/**
+ * Per block shared memory limit for the given function configuration.
+ *
+ * Compute majors 2-6 always use properties->sharedMemPerBlock. Majors 7, 8,
+ * 9, 10 and 12 use properties->sharedMemPerBlockOptin instead when
+ * shmemLimitConfig is FUNC_SHMEM_LIMIT_OPTIN and smemPerCta is larger than
+ * sharedMemPerBlock. From major 8 upwards
+ * properties->reservedSharedMemPerBlock is added on top. Any other major
+ * returns CUDA_OCC_ERROR_UNKNOWN_DEVICE and leaves *limit untouched.
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
+{
+    const int ccMajor = properties->computeMajor;
+    // Known majors are 2..10 and 12
+    if (ccMajor < 2 || ccMajor > 12 || ccMajor == 11) {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+    if (ccMajor >= 7 &&
+        shmemLimitConfig == FUNC_SHMEM_LIMIT_OPTIN &&
+        smemPerCta > properties->sharedMemPerBlock) {
+        // The function opted in and actually needs more than the default
+        *limit = properties->sharedMemPerBlockOptin;
+    }
+    else {
+        *limit = properties->sharedMemPerBlock;
+    }
+    if (ccMajor >= 8) {
+        // Starting Ampere, CUDA driver reserves additional shared memory per block
+        *limit += properties->reservedSharedMemPerBlock;
+    }
+    return CUDA_OCC_SUCCESS;
+}
+/**
+ * Reports whether the device supports partitioned global caching.
+ *
+ * *limit is set to PARTITIONED_GC_SUPPORTED for compute capability 5.2, 5.3
+ * and every 6.x except 6.0; otherwise to PARTITIONED_GC_NOT_SUPPORTED.
+ * Always returns CUDA_OCC_SUCCESS.
+ */
+static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
+{
+    const int ccMajor = properties->computeMajor;
+    const int ccMinor = properties->computeMinor;
+    const int is52or53    = (ccMajor == 5) && (ccMinor == 2 || ccMinor == 3);
+    const int is6xExcept60 = (ccMajor == 6) && (ccMinor != 0);
+    if (is52or53 || is6xExcept60) {
+        *limit = PARTITIONED_GC_SUPPORTED;
+    }
+    else {
+        *limit = PARTITIONED_GC_NOT_SUPPORTED;
+    }
+    return CUDA_OCC_SUCCESS;
+}
+///////////////////////////////////////////////
+//            User Input Sanity              //
+///////////////////////////////////////////////
+static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
+{
+    // Sanity check of the device properties: every limit listed below has to
+    // be strictly positive.
+    //
+    // The compute capability itself is not validated here; it is checked
+    // during the occupancy calculation.
+    //
+    const int allLimitsPositive =
+        properties->maxThreadsPerBlock          > 0 &&
+        properties->maxThreadsPerMultiprocessor > 0 &&
+        properties->regsPerBlock                > 0 &&
+        properties->regsPerMultiprocessor       > 0 &&
+        properties->warpSize                    > 0 &&
+        properties->sharedMemPerBlock           > 0 &&
+        properties->sharedMemPerMultiprocessor  > 0 &&
+        properties->numSms                      > 0;
+    if (allLimitsPositive) {
+        return CUDA_OCC_SUCCESS;
+    }
+    return CUDA_OCC_ERROR_INVALID_INPUT;
+}
+static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
+{
+    // Sanity check of the function attributes.
+    //
+    // maxThreadsPerBlock must be strictly positive. numRegs only has to be
+    // non-negative, because the compiler may choose not to use any register
+    // (empty kernels, etc.)
+    //
+    if (attributes->maxThreadsPerBlock > 0 && attributes->numRegs >= 0) {
+        return CUDA_OCC_SUCCESS;
+    }
+    return CUDA_OCC_ERROR_INVALID_INPUT;
+}
+static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
+{
+    // Placeholder: nothing in the device state is validated at the moment,
+    // so every state is accepted.
+    //
+    (void)state;   // parameter intentionally unused
+    return CUDA_OCC_SUCCESS;
+}
+static __OCC_INLINE cudaOccError cudaOccInputCheck(
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state)
+{
+    // Runs the three input checks in order (device properties, function
+    // attributes, device state) and returns the first failure, or
+    // CUDA_OCC_SUCCESS when all of them pass. Later checks are skipped once
+    // one has failed.
+    //
+    cudaOccError status = cudaOccDevicePropCheck(properties);
+    if (status == CUDA_OCC_SUCCESS) {
+        status = cudaOccFuncAttributesCheck(attributes);
+    }
+    if (status == CUDA_OCC_SUCCESS) {
+        status = cudaOccDeviceStateCheck(state);
+    }
+    return status;
+}
+///////////////////////////////////////////////
+//    Occupancy calculation Functions        //
+///////////////////////////////////////////////
+static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes)
+{
+    // Returns the partitioned global caching setting to start the occupancy
+    // calculation with: the function's own partitionedGCConfig, forced to
+    // PARTITIONED_GC_OFF when the device does not support the feature.
+    //
+    cudaOccPartitionedGCSupport deviceSupport;
+    cudaOccPartitionedGCConfig  requested;
+    cudaOccPartitionedGlobalCachingModeSupport(&deviceSupport, properties);
+    requested = attributes->partitionedGCConfig;
+    return (deviceSupport == PARTITIONED_GC_NOT_SUPPORTED) ? PARTITIONED_GC_OFF : requested;
+}
+// Warp limit: number of CTAs per SM permitted by the available warp slots.
+// Stores the count in *limit and always returns CUDA_OCC_SUCCESS.
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
+    int                         *limit,
+    cudaOccPartitionedGCConfig   gcConfig,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    int                          blockSize)
+{
+    int blocksPerSm = 0;
+    (void)attributes;   // not needed for the warp limit
+    // A block larger than the device maximum cannot be launched at all, so
+    // the limit stays 0 in that case.
+    if (blockSize <= properties->maxThreadsPerBlock) {
+        const int warpSlotsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
+        const int warpsPerCta    = __occDivideRoundUp(blockSize, properties->warpSize);
+        if (gcConfig == PARTITIONED_GC_OFF) {
+            blocksPerSm = warpSlotsPerSm / warpsPerCta;
+        }
+        else {
+            // With partitioned global caching on, a CTA is confined to one
+            // SM partition (a half SM) and therefore to half of the warp
+            // slots. Count the CTAs that fit in one half and double it.
+            //
+            // On hardware that supports partitioned global caching, each
+            // half SM is guaranteed to support at least 32 warps (maximum
+            // number of warps of a CTA), so caching will not cause 0
+            // occupancy due to insufficient warp allocation slots.
+            //
+            const int warpSlotsPerHalfSm = warpSlotsPerSm / 2;
+            blocksPerSm = (warpSlotsPerHalfSm / warpsPerCta) * 2;
+        }
+    }
+    *limit = blocksPerSm;
+    return CUDA_OCC_SUCCESS;
+}
+// Shared memory limit: number of CTAs per SM permitted by shared memory usage.
+// Stores the count in *limit and the per-CTA allocation in result->allocatedSharedMemPerBlock; both are left untouched on an error return.
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
+    int                         *limit,
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                          blockSize,
+    size_t                       dynamicSmemSize)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int allocationGranularity;
+    size_t userSmemPreference = 0;
+    size_t totalSmemUsagePerCTA;
+    size_t maxSmemUsagePerCTA;
+    size_t smemAllocatedPerCTA;
+    size_t staticSmemSize;
+    size_t sharedMemPerMultiprocessor;
+    size_t smemLimitPerCTA;
+    int maxBlocks;
+    int dynamicSmemSizeExceeded = 0;
+    int totalSmemSizeExceeded = 0;
+    (void)blockSize;   // silence unused-variable warning
+    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    // Obtain the user preferred shared memory size. This setting is ignored if
+    // user requests more shared memory than preferred.
+    //
+    status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;   // "static" here includes the per-block reserved amount
+    totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
+    smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);   // NOTE(review): the size_t total is narrowed to int before rounding
+    maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;   // worst case the function is configured for; only used to pick the per-CTA limit below
+    dynamicSmemSizeExceeded = 0;
+    totalSmemSizeExceeded   = 0;
+    // Obtain the user set maximum dynamic size if it exists
+    // If so, the current launch dynamic shared memory must not
+    // exceed the set limit
+    if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
+        dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
+        dynamicSmemSizeExceeded = 1;
+    }
+    status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    if (smemAllocatedPerCTA > smemLimitPerCTA) {
+        totalSmemSizeExceeded = 1;
+    }
+    if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
+        maxBlocks = 0;   // the launch itself is invalid, so no CTA can be resident
+    }
+    else {
+        // User requested shared memory limit is used as long as it is greater
+        // than the total shared memory used per CTA, i.e. as long as at least
+        // one CTA can be launched.
+        if (userSmemPreference >= smemAllocatedPerCTA) {
+            sharedMemPerMultiprocessor = userSmemPreference;
+        }
+        else {
+            // On Volta+, user requested shared memory will limit occupancy
+            // if it's less than shared memory per CTA. Otherwise, the
+            // maximum shared memory limit is used.
+            if (properties->computeMajor >= 7) {
+                sharedMemPerMultiprocessor = smemAllocatedPerCTA;
+                status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);   // grow to the next size the device offers
+                if (status != CUDA_OCC_SUCCESS) {
+                    return status;
+                }
+            }
+            else {
+                sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
+            }
+        }
+        if (smemAllocatedPerCTA > 0) {
+            maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
+        }
+        else {
+            maxBlocks = INT_MAX;   // no shared memory used at all: this resource imposes no limit
+        }
+    }
+    result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;
+    *limit = maxBlocks;
+    return status;
+}
+static __OCC_INLINE   // Register limit: number of CTAs per SM permitted by register usage; may switch *gcConfig to PARTITIONED_GC_OFF
+cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
+    int                         *limit,
+    cudaOccPartitionedGCConfig  *gcConfig,
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    int                          blockSize)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int allocationGranularity;
+    int warpsAllocatedPerCTA;
+    int regsAllocatedPerCTA;
+    int regsAssumedPerCTA;
+    int regsPerWarp;
+    int regsAllocatedPerWarp;
+    int numSubPartitions;
+    int numRegsPerSubPartition;
+    int numWarpsPerSubPartition;
+    int numWarpsPerSM;
+    int maxBlocks;
+    int maxRegsPerThread;
+    status = cudaOccRegAllocationGranularity(
+        &allocationGranularity,
+        properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;   // *limit, *gcConfig and *result are left untouched on every early error return
+    }
+    status = cudaOccRegAllocationMaxPerThread(
+        &maxRegsPerThread,
+        properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
+    // GPUs of compute capability 2.x and higher allocate registers to warps
+    //
+    // Number of regs per warp is regs per thread x warp size, rounded up to
+    // register allocation granularity
+    //
+    regsPerWarp          = attributes->numRegs * properties->warpSize;
+    regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
+    regsAllocatedPerCTA  = regsAllocatedPerWarp * warpsAllocatedPerCTA;
+    // Hardware verifies if a launch fits the per-CTA register limit. For
+    // historical reasons, the verification logic assumes register
+    // allocations are made to all partitions simultaneously. Therefore, to
+    // simulate the hardware check, the warp allocation needs to be rounded
+    // up to the number of partitions.
+    //
+    regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);
+    if (properties->regsPerBlock < regsAssumedPerCTA ||   // Hardware check
+        properties->regsPerBlock < regsAllocatedPerCTA || // Software check
+        attributes->numRegs > maxRegsPerThread) {         // Per thread limit check
+        maxBlocks = 0;   // the launch does not fit; *gcConfig is not modified on this path
+    }
+    else {
+        if (regsAllocatedPerWarp > 0) {
+            // Registers are allocated in each sub-partition. The max number
+            // of warps that can fit on an SM is equal to the max number of
+            // warps per sub-partition x number of sub-partitions.
+            //
+            numRegsPerSubPartition  = properties->regsPerMultiprocessor / numSubPartitions;
+            numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;
+            maxBlocks = 0;   // stays 0 unless the partitioned calculation below finds room
+            if (*gcConfig != PARTITIONED_GC_OFF) {
+                int numSubPartitionsPerSmPartition;
+                int numWarpsPerSmPartition;
+                int maxBlocksPerSmPartition;
+                // If partitioned global caching is on, then a CTA can only
+                // use a half SM, and thus a half of the registers available
+                // per SM
+                //
+                numSubPartitionsPerSmPartition = numSubPartitions / 2;
+                numWarpsPerSmPartition         = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
+                maxBlocksPerSmPartition        = numWarpsPerSmPartition / warpsAllocatedPerCTA;
+                maxBlocks                      = maxBlocksPerSmPartition * 2;
+            }
+            // Try again if partitioned global caching is not enabled, or if
+            // the CTA cannot fit on the SM with caching on (maxBlocks == 0).  In the latter
+            // case, the device will automatically turn off caching, except
+            // if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
+            // occupancy and launch configuration.
+            //
+            if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
+               // In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
+               // this is what it will be if we spread CTA across partitions.
+               //
+               *gcConfig = PARTITIONED_GC_OFF;
+               numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
+               maxBlocks     = numWarpsPerSM / warpsAllocatedPerCTA;
+            }
+        }
+        else {
+            maxBlocks = INT_MAX;   // no registers used at all: this resource imposes no limit
+        }
+    }
+    result->allocatedRegistersPerBlock = regsAllocatedPerCTA;
+    *limit = maxBlocks;
+    return status;
+}
+// Barrier limit: number of CTAs per SM permitted by the block barriers the
+// function uses. The number of barriers available is derived from
+// ctaLimitBlocks (the blocks/SM cap) and the compute capability.
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
+    int                         *limit,
+    int                          ctaLimitBlocks,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes)
+{
+    const int barriersUsedPerCta = attributes->numBlockBarriers;
+    const int ccMajor = properties->computeMajor;
+    const int ccMinor = properties->computeMinor;
+    int barriersPerBlockSlot;
+    int barriersAvailable;
+    if (ccMajor == 5 || ccMajor == 6 || ccMajor == 7 || ccMajor == 9) {
+        barriersPerBlockSlot = 2;
+    }
+    else if (ccMajor == 8) {
+        // Only 8.0 gets two barriers per block slot
+        barriersPerBlockSlot = (ccMinor == 0) ? 2 : 1;
+    }
+    else if (ccMajor == 10) {
+        // 10.1 gets one barrier per block slot, every other 10.x two
+        barriersPerBlockSlot = (ccMinor == 1) ? 1 : 2;
+    }
+    else if (ccMajor == 12) {
+        barriersPerBlockSlot = 1;
+    }
+    else {
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+    barriersAvailable = ctaLimitBlocks * barriersPerBlockSlot;
+    // A function that uses no block barriers is not limited by them
+    if (barriersUsedPerCta != 0) {
+        *limit = barriersAvailable / barriersUsedPerCta;
+    }
+    else {
+        *limit = INT_MAX;
+    }
+    return CUDA_OCC_SUCCESS;
+}
+///////////////////////////////////
+//      API Implementations      //
+///////////////////////////////////
+static __OCC_INLINE   // Computes the number of blocks of the given size and dynamic shared memory that can be active per multiprocessor, filling *result
+cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                          blockSize,
+    size_t                       dynamicSmemSize)
+{
+    cudaOccError status          = CUDA_OCC_SUCCESS;
+    int          ctaLimitWarps   = 0;
+    int          ctaLimitBlocks  = 0;
+    int          ctaLimitSMem    = 0;
+    int          ctaLimitRegs    = 0;
+    int          ctaLimitBars    = 0;
+    int          ctaLimit        = 0;
+    unsigned int limitingFactors = 0;
+    cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;
+    if (!result || !properties || !attributes || !state || blockSize <= 0) {   // null pointers and non-positive block sizes are rejected up front
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+    status = cudaOccInputCheck(properties, attributes, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    ///////////////////////////
+    // Initialization
+    ///////////////////////////
+    gcConfig = cudaOccPartitionedGCExpected(properties, attributes);
+    ///////////////////////////
+    // Compute occupancy
+    ///////////////////////////
+    // Limits due to registers/SM
+    // Also compute if partitioned global caching has to be turned off
+    //
+    status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);   // may update gcConfig; the warp limit below uses the updated value
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    // SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
+    // As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
+    // For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0),
+    // we do not let it run on any Pascal processor, even though it may be able to run on GP100.
+    // Therefore, we check the occupancy on GP10x when it can run on GP100
+    //
+    if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
+        cudaOccDeviceProp propertiesGP10x;
+        cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;   // scratch copy so the real gcConfig is not disturbed by the re-check
+        int ctaLimitRegsGP10x = 0;
+        // Set up properties for GP10x
+        memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
+        propertiesGP10x.computeMinor = 1;
+        status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);   // NOTE(review): result is passed again, so its register allocation field is rewritten by this second call
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+        if (ctaLimitRegsGP10x == 0) {
+            ctaLimitRegs = 0;
+        }
+    }
+    // Limits due to warps/SM
+    //
+    status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    // Limits due to blocks/SM
+    //
+    status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    // Limits due to shared memory/SM
+    //
+    status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    ///////////////////////////
+    // Overall occupancy
+    ///////////////////////////
+    // Overall limit is min() of limits due to above reasons
+    //
+    ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));
+    // Determine occupancy limiting factors (several bits are set when limits tie)
+    //
+    if (ctaLimit == ctaLimitWarps) {
+        limitingFactors |= OCC_LIMIT_WARPS;
+    }
+    if (ctaLimit == ctaLimitRegs) {
+        limitingFactors |= OCC_LIMIT_REGISTERS;
+    }
+    if (ctaLimit == ctaLimitSMem) {
+        limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
+    }
+    if (ctaLimit == ctaLimitBlocks) {
+        limitingFactors |= OCC_LIMIT_BLOCKS;
+    }
+    // For Hopper onwards compute the limits to occupancy based on block barrier count
+    //
+    if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
+        // Limits due to barrier/SM
+        //
+        status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, properties, attributes);
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+        // Recompute overall limit based on barrier/SM
+        //
+        ctaLimit = __occMin(ctaLimitBars, ctaLimit);
+        // Determine if this is occupancy limiting factor
+        // NOTE(review): bits set above are not cleared when the barrier limit lowers ctaLimit
+        if (ctaLimit == ctaLimitBars) {
+            limitingFactors |= OCC_LIMIT_BARRIERS;
+        }
+    }
+    else {
+        ctaLimitBars = INT_MAX;   // barriers impose no limit in this case
+    }
+    // Fill in the return values (reached only on success; error returns above skip this)
+    //
+    result->limitingFactors = limitingFactors;
+    result->blockLimitRegs      = ctaLimitRegs;
+    result->blockLimitSharedMem = ctaLimitSMem;
+    result->blockLimitWarps     = ctaLimitWarps;
+    result->blockLimitBlocks    = ctaLimitBlocks;
+    result->blockLimitBarriers  = ctaLimitBars;
+    result->partitionedGCConfig = gcConfig;
+    // Final occupancy
+    result->activeBlocksPerMultiprocessor = ctaLimit;
+    return CUDA_OCC_SUCCESS;
+}
+// Computes the amount of dynamic shared memory each block may use while
+// still allowing numBlocks blocks of blockSize threads per multiprocessor.
+//
+// Parameters:
+//   bytesAvailable - output; receives the per-block dynamic shared memory budget
+//   properties     - device properties
+//   attributes     - function attributes (static shared memory size and limits)
+//   state          - device state, used to obtain the user shared memory preference
+//   numBlocks      - requested number of resident blocks per multiprocessor (> 0)
+//   blockSize      - number of threads per block
+//
+// Returns CUDA_OCC_SUCCESS on success. Returns CUDA_OCC_ERROR_INVALID_INPUT
+// when a pointer argument is NULL, when numBlocks is not positive, or when
+// numBlocks blocks are not achievable even without dynamic shared memory.
+// Any other error reported by a helper is passed through unchanged.
+//
+static __OCC_INLINE
+cudaOccError cudaOccAvailableDynamicSMemPerBlock(
+    size_t                      *bytesAvailable,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                         numBlocks,
+    int                         blockSize)
+{
+    int allocationGranularity;
+    size_t smemLimitPerBlock;
+    size_t smemAvailableForDynamic;
+    size_t userSmemPreference = 0;
+    size_t sharedMemPerMultiprocessor;
+    cudaOccResult result;
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    // Reject NULL pointers before any of them is dereferenced, in the same
+    // way the other public entry points of this header validate their input.
+    //
+    if (!bytesAvailable || !properties || !attributes || !state) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+    if (numBlocks <= 0) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+    // First compute occupancy of potential kernel launch with no dynamic
+    // shared memory at all.
+    //
+    status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    // Check if occupancy is achievable given user requested number of blocks.
+    //
+    if (result.activeBlocksPerMultiprocessor < numBlocks) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    // Return the per block shared memory limit based on function config.
+    //
+    status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    // If only a single block is needed per SM, the user preference can be
+    // ignored and the full per-block limit may be used as shared memory.
+    // If more than one block is needed, the user preference sets the total
+    // amount of shared memory available.
+    //
+    // The status of this query used to be discarded; propagate it so that a
+    // failed query cannot leave userSmemPreference at its initial value.
+    //
+    status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    if (numBlocks == 1) {
+        sharedMemPerMultiprocessor = smemLimitPerBlock;
+    }
+    else {
+        if (!userSmemPreference) {
+            // No preference reported: start from one byte and let the
+            // helper align it up to a supported size.
+            //
+            userSmemPreference = 1 ;
+            status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
+            if (status != CUDA_OCC_SUCCESS) {
+                return status;
+            }
+        }
+        sharedMemPerMultiprocessor = userSmemPreference;
+    }
+    // Split the shared memory evenly between the blocks and round the share
+    // down to a multiple of the allocation granularity.
+    //
+    smemAvailableForDynamic =  sharedMemPerMultiprocessor / numBlocks;
+    smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;
+    // Cap shared memory
+    //
+    if (smemAvailableForDynamic > smemLimitPerBlock) {
+        smemAvailableForDynamic = smemLimitPerBlock;
+    }
+    // Now compute dynamic shared memory size by removing the static part.
+    // The subtraction is unsigned, so clamp at zero instead of wrapping
+    // around when the static usage is not smaller than the per-block share.
+    //
+    if (smemAvailableForDynamic > attributes->sharedSizeBytes) {
+        smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes;
+    }
+    else {
+        smemAvailableForDynamic = 0;
+    }
+    // Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute()
+    //
+    if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
+        smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;
+    *bytesAvailable = smemAvailableForDynamic;
+    return CUDA_OCC_SUCCESS;
+}
+// Searches for the block size that keeps the most threads resident on one
+// multiprocessor and reports it together with a suggested minimum grid size.
+//
+// Parameters:
+//   minGridSize                - output; best blocks-per-SM count times properties->numSms
+//   blockSize                  - output; best block size found (0 if none was found)
+//   properties, attributes,
+//   state                      - device / function / state descriptions
+//   blockSizeToDynamicSMemSize - optional callback giving the dynamic shared
+//                                memory size for a block size; may be NULL
+//   dynamicSMemSize            - dynamic shared memory size used when no
+//                                callback is supplied
+//
+// Returns CUDA_OCC_ERROR_INVALID_INPUT for a NULL pointer argument, otherwise
+// the first error reported by a helper, otherwise CUDA_OCC_SUCCESS.
+//
+static __OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    size_t                     (*blockSizeToDynamicSMemSize)(int),
+    size_t                       dynamicSMemSize)
+{
+    cudaOccError  err = CUDA_OCC_SUCCESS;
+    cudaOccResult occ;
+    // Search bounds
+    int threadsPerSmLimit;
+    int warpThreads;
+    int maxThreadsAllowed;
+    int alignedCandidate;
+    // Best configuration seen so far
+    int bestBlockSize   = 0;
+    int bestBlockCount  = 0;
+    int bestThreadCount = 0;
+    ///////////////////////////
+    // Validate arguments
+    ///////////////////////////
+    if (!(minGridSize && blockSize && properties && attributes && state)) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+    err = cudaOccInputCheck(properties, attributes, state);
+    if (err != CUDA_OCC_SUCCESS) {
+        return err;
+    }
+    /////////////////////////////////////////////////////////////////////////////////
+    // Walk the candidate block sizes from the largest down, one warp at a time
+    /////////////////////////////////////////////////////////////////////////////////
+    threadsPerSmLimit = properties->maxThreadsPerMultiprocessor;
+    warpThreads       = properties->warpSize;
+    maxThreadsAllowed = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
+    alignedCandidate  = __occRoundUp(maxThreadsAllowed, warpThreads);
+    while (alignedCandidate > 0) {
+        // The first candidate may be rounded above the real limit; clamp it.
+        //
+        int candidate = __occMin(maxThreadsAllowed, alignedCandidate);
+        int residentBlocks;
+        int residentThreads;
+        // A user supplied mapping overrides dynamicSMemSize
+        //
+        if (blockSizeToDynamicSMemSize) {
+            dynamicSMemSize = (*blockSizeToDynamicSMemSize)(candidate);
+        }
+        err = cudaOccMaxActiveBlocksPerMultiprocessor(&occ, properties, attributes, state, candidate, dynamicSMemSize);
+        if (err != CUDA_OCC_SUCCESS) {
+            return err;
+        }
+        residentBlocks  = occ.activeBlocksPerMultiprocessor;
+        residentThreads = candidate * residentBlocks;
+        // Only a strictly better thread count replaces the recorded best, so
+        // on ties the larger block size (visited first) is kept.
+        //
+        if (residentThreads > bestThreadCount) {
+            bestBlockSize   = candidate;
+            bestBlockCount  = residentBlocks;
+            bestThreadCount = residentThreads;
+        }
+        // Nothing can beat a fully occupied multiprocessor; stop searching.
+        //
+        if (bestThreadCount == threadsPerSmLimit) {
+            break;
+        }
+        alignedCandidate -= warpThreads;
+    }
+    ///////////////////////////
+    // Report the best result
+    ///////////////////////////
+    // Suggested min grid size to achieve a full machine launch
+    //
+    *minGridSize = bestBlockCount * properties->numSms;
+    *blockSize   = bestBlockSize;
+    return err;
+}
+#if defined(__cplusplus)
+namespace {
+// C++ convenience overload for a fixed dynamic shared memory size: forwards
+// to the seven-argument cudaOccMaxPotentialOccupancyBlockSize with a NULL
+// block-size-to-shared-memory callback and returns its status unchanged.
+//
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    size_t                       dynamicSMemSize)
+{
+    // No per-block-size mapping is supplied by this overload.
+    //
+    size_t (*const noSizeMapping)(int) = NULL;
+    return cudaOccMaxPotentialOccupancyBlockSize(
+        minGridSize, blockSize, properties, attributes, state, noSizeMapping, dynamicSMemSize);
+}
+// Block size search for kernels whose dynamic shared memory size depends on
+// the block size. blockSizeToDynamicSMemSize is called once per candidate
+// block size and its result is used as that candidate's dynamic shared
+// memory size.
+//
+// Outputs:
+//   minGridSize - best blocks-per-SM count times properties->numSms
+//   blockSize   - best block size found (0 if none was found)
+//
+// Returns CUDA_OCC_ERROR_INVALID_INPUT for a NULL pointer argument, otherwise
+// the first error reported by a helper, otherwise CUDA_OCC_SUCCESS.
+//
+template <typename UnaryFunction>
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    UnaryFunction                blockSizeToDynamicSMemSize)
+{
+    cudaOccError  err = CUDA_OCC_SUCCESS;
+    cudaOccResult occ;
+    // Best configuration seen so far
+    int bestBlockSize   = 0;
+    int bestBlockCount  = 0;
+    int bestThreadCount = 0;
+    ///////////////////////////
+    // Validate arguments
+    ///////////////////////////
+    if (!(minGridSize && blockSize && properties && attributes && state)) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+    err = cudaOccInputCheck(properties, attributes, state);
+    if (err != CUDA_OCC_SUCCESS) {
+        return err;
+    }
+    /////////////////////////////////////////////////////////////////////////////////
+    // Walk the candidate block sizes from the largest down, one warp at a time
+    /////////////////////////////////////////////////////////////////////////////////
+    const int threadsPerSmLimit = properties->maxThreadsPerMultiprocessor;
+    const int warpThreads       = properties->warpSize;
+    const int maxThreadsAllowed = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
+    for (int aligned = __occRoundUp(maxThreadsAllowed, warpThreads); aligned > 0; aligned -= warpThreads) {
+        // The first candidate may be rounded above the real limit; clamp it.
+        //
+        int candidate = __occMin(maxThreadsAllowed, aligned);
+        size_t smemForCandidate = blockSizeToDynamicSMemSize(candidate);
+        err = cudaOccMaxActiveBlocksPerMultiprocessor(&occ, properties, attributes, state, candidate, smemForCandidate);
+        if (err != CUDA_OCC_SUCCESS) {
+            return err;
+        }
+        const int residentBlocks  = occ.activeBlocksPerMultiprocessor;
+        const int residentThreads = candidate * residentBlocks;
+        // Only a strictly better thread count replaces the recorded best, so
+        // on ties the larger block size (visited first) is kept.
+        //
+        if (residentThreads > bestThreadCount) {
+            bestBlockSize   = candidate;
+            bestBlockCount  = residentBlocks;
+            bestThreadCount = residentThreads;
+        }
+        // Nothing can beat a fully occupied multiprocessor; stop searching.
+        //
+        if (bestThreadCount == threadsPerSmLimit) {
+            break;
+        }
+    }
+    ///////////////////////////
+    // Report the best result
+    ///////////////////////////
+    // Suggested min grid size to achieve a full machine launch
+    //
+    *minGridSize = bestBlockCount * properties->numSms;
+    *blockSize   = bestBlockSize;
+    return err;
+}
+} // namespace anonymous
+#endif /*__cplusplus */
+#undef __OCC_INLINE
+#endif /*__cuda_occupancy_h__*/

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline_primitives.h ADDED Viewed

	@@ -0,0 +1,148 @@

+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#ifndef _CUDA_PIPELINE_PRIMITIVES_H_
+# define _CUDA_PIPELINE_PRIMITIVES_H_
+# include "cuda_pipeline_helpers.h"
+// Dispatches a runtime (size_and_align, zfill) pair to the matching
+// compile-time instantiation of the internal pipeline_memcpy_async.
+//
+//   dst_shared     - destination; asserted to be a shared memory address
+//   src_global     - source; asserted to be a global memory address
+//   size_and_align - 4, 8 or 16; both pointers are asserted to be aligned to it
+//   zfill          - 0 .. size_and_align (default 0); the second template
+//                    argument passed on is size_and_align - zfill
+//
+// Any combination outside the table ends in _CUDA_PIPELINE_ABORT().
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align,
+                             size_t zfill = 0)
+{
+    // Argument range checks
+    _CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16);
+    _CUDA_PIPELINE_ASSERT(zfill <= size_and_align);
+    // Address space checks
+    _CUDA_PIPELINE_ASSERT(__isShared(dst_shared));
+    _CUDA_PIPELINE_ASSERT(__isGlobal(src_global));
+    // Alignment checks: the low bits below size_and_align must be clear
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1)));
+    // Template arguments must be compile-time constants, hence the explicit
+    // table. Every path leaves the switches and falls through to the single
+    // exit at the end of the function.
+    switch (size_and_align) {
+    case 16:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 16>(dst_shared, src_global); break;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 15>(dst_shared, src_global); break;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 14>(dst_shared, src_global); break;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 13>(dst_shared, src_global); break;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 12>(dst_shared, src_global); break;
+        case  5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 11>(dst_shared, src_global); break;
+        case  6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 10>(dst_shared, src_global); break;
+        case  7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  9>(dst_shared, src_global); break;
+        case  8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  8>(dst_shared, src_global); break;
+        case  9: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  7>(dst_shared, src_global); break;
+        case 10: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  6>(dst_shared, src_global); break;
+        case 11: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  5>(dst_shared, src_global); break;
+        case 12: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  4>(dst_shared, src_global); break;
+        case 13: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  3>(dst_shared, src_global); break;
+        case 14: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  2>(dst_shared, src_global); break;
+        case 15: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  1>(dst_shared, src_global); break;
+        case 16: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16,  0>(dst_shared, src_global); break;
+        default: _CUDA_PIPELINE_ABORT();                                                                   break;
+        }
+        break;
+    case 8:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  8>(dst_shared, src_global); break;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  7>(dst_shared, src_global); break;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  6>(dst_shared, src_global); break;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  5>(dst_shared, src_global); break;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  4>(dst_shared, src_global); break;
+        case  5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  3>(dst_shared, src_global); break;
+        case  6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  2>(dst_shared, src_global); break;
+        case  7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  1>(dst_shared, src_global); break;
+        case  8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8,  0>(dst_shared, src_global); break;
+        default: _CUDA_PIPELINE_ABORT();                                                                   break;
+        }
+        break;
+    case 4:
+        switch (zfill) {
+        case  0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  4>(dst_shared, src_global); break;
+        case  1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  3>(dst_shared, src_global); break;
+        case  2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  2>(dst_shared, src_global); break;
+        case  3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  1>(dst_shared, src_global); break;
+        case  4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4,  0>(dst_shared, src_global); break;
+        default: _CUDA_PIPELINE_ABORT();                                                                   break;
+        }
+        break;
+    default:
+        // Unsupported copy size
+        _CUDA_PIPELINE_ABORT();
+        break;
+    }
+}
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_commit() // Public wrapper; takes no arguments and returns nothing.
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit(); // Forward to the internal implementation unchanged.
+}
+// Maps the runtime value of prior onto a compile-time instantiation of the
+// internal pipeline_wait_prior<N>. Values 0 through 7 select N == prior;
+// every larger value selects pipeline_wait_prior<8>.
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_wait_prior(size_t prior)
+{
+    // The template argument must be a constant, so each supported value gets
+    // its own case; all cases fall through to the single exit below.
+    switch (prior) {
+    case  0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break;
+    case  1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break;
+    case  2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break;
+    case  3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break;
+    case  4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break;
+    case  5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break;
+    case  6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break;
+    case  7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break;
+    default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break;
+    }
+}
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+#  include "cuda_awbarrier_primitives.h"
+_CUDA_PIPELINE_STATIC_QUALIFIER
+void __pipeline_arrive_on(__mbarrier_t* barrier) // Public wrapper; only compiled when _CUDA_PIPELINE_ARCH_700_OR_LATER is defined.
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(barrier); // Pass the caller's barrier pointer through unchanged.
+}
+# endif
+#endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti.h ADDED Viewed

	@@ -0,0 +1,123 @@

+/*
+ * Copyright 2010-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(_CUPTI_H_)
+#define _CUPTI_H_
+#ifdef _WIN32
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#ifdef NOMINMAX
+#include <windows.h>
+#else
+#define NOMINMAX
+#include <windows.h>
+#undef NOMINMAX
+#endif
+#endif
+#include <cuda.h>
+#include <cupti_result.h>
+#include <cupti_version.h>
+/* Activity, callback, event and metric APIs */
+#include <cupti_activity.h>
+#include <cupti_callbacks.h>
+#include <cupti_events.h>
+#include <cupti_metrics.h>
+/* Runtime, driver, and nvtx function identifiers */
+#include <cupti_driver_cbid.h>
+#include <cupti_runtime_cbid.h>
+#include <cupti_nvtx_cbid.h>
+/* To support function parameter structures for obsoleted API. See
+   cuda.h for the actual definition of these structures.
+   The struct typedefs below are placeholders that contain only a
+   single dummy member, and CUdeviceptr_v1 is a plain unsigned int.
+   Presumably they exist so that the generated parameter structure
+   headers included after this point can name these types; confirm
+   against those headers before relying on it. */
+typedef unsigned int CUdeviceptr_v1;
+typedef struct CUDA_MEMCPY2D_v1_st { int dummy; } CUDA_MEMCPY2D_v1;
+typedef struct CUDA_MEMCPY3D_v1_st { int dummy; } CUDA_MEMCPY3D_v1;
+typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st { int dummy; } CUDA_ARRAY_DESCRIPTOR_v1;
+typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st { int dummy; } CUDA_ARRAY3D_DESCRIPTOR_v1;
+/* Function parameter structures */
+#include <generated_cuda_runtime_api_meta.h>
+#include <generated_cuda_meta.h>
+/* The following parameter structures cannot be included unless a
+   header that defines GL_VERSION is included before including them.
+   If these are needed then make sure such a header is included
+   already. */
+#ifdef GL_VERSION
+#include <generated_cuda_gl_interop_meta.h>
+#include <generated_cudaGL_meta.h>
+#endif
+//#include <generated_nvtx_meta.h>
+/* The following parameter structures cannot be included by default as
+   they are not guaranteed to be available on all systems. Uncomment
+   the includes that are available, or use the include explicitly. */
+#if defined(__linux__)
+//#include <generated_cuda_vdpau_interop_meta.h>
+//#include <generated_cudaVDPAU_meta.h>
+#endif
+#ifdef _WIN32
+//#include <generated_cuda_d3d9_interop_meta.h>
+//#include <generated_cuda_d3d10_interop_meta.h>
+//#include <generated_cuda_d3d11_interop_meta.h>
+//#include <generated_cudaD3D9_meta.h>
+//#include <generated_cudaD3D10_meta.h>
+//#include <generated_cudaD3D11_meta.h>
+#endif
+#endif /*_CUPTI_H_*/

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_events.h ADDED Viewed

	@@ -0,0 +1,1349 @@

+/*
+ * Copyright 2010-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(_CUPTI_EVENTS_H_)
+#define _CUPTI_EVENTS_H_
+#include <cuda.h>
+#include <string.h>
+#include <cuda_stdint.h>
+#include <cupti_result.h>
+#ifndef CUPTIAPI
+#ifdef _WIN32
+#define CUPTIAPI __stdcall
+#else
+#define CUPTIAPI
+#endif
+#endif
+#if defined(__cplusplus)
+extern "C" {
+#endif
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+/**
+ * \defgroup CUPTI_EVENT_API CUPTI Event API
+ * Functions, types, and enums that implement the CUPTI Event API.
+ *
+ * \note The CUPTI event API from the header cupti_events.h is not supported on devices
+ * with compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
+ * This API is deprecated in CUDA 12.8 release and will be removed in a future CUDA release.
+ * This is replaced by the host profiling API in the header cupti_profiler_host.h and
+ * target profiling API in the header cupti_range_profiler.h which are supported on
+ * devices with compute capability 7.0 and higher (i.e. Volta and later GPU architectures).
+ *
+ * @{
+ */
+/**
+ * \brief ID for an event.
+ *
+ * An event represents a countable activity, action, or occurrence on
+ * the device.
+ */
+typedef uint32_t CUpti_EventID;
+/**
+ * \brief ID for an event domain.
+ *
+ * ID for an event domain. An event domain represents a group of
+ * related events. A device may have multiple instances of a domain,
+ * indicating that the device can simultaneously record multiple
+ * instances of each event within that domain.
+ */
+typedef uint32_t CUpti_EventDomainID;
+/**
+ * \brief A group of events.
+ *
+ * An event group is a collection of events that are managed
+ * together. All events in an event group must belong to the same
+ * domain.
+ */
+typedef void *CUpti_EventGroup;
+/**
+ * \brief Device class.
+ *
+ * Enumeration of device classes for device attribute
+ * CUPTI_DEVICE_ATTR_DEVICE_CLASS.
+ *
+ * Each enumerator has an explicitly assigned value (0 through 3), so
+ * the numeric values do not depend on declaration order.
+ */
+typedef enum {
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_TESLA              = 0,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_QUADRO             = 1,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_GEFORCE            = 2,
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS_TEGRA              = 3,
+} CUpti_DeviceAttributeDeviceClass;
+/**
+ * \brief Device attributes.
+ *
+ * CUPTI device attributes. These attributes can be read using \ref
+ * cuptiDeviceGetAttribute.
+ */
+typedef enum {
+  /**
+   * Number of event IDs for a device. Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_EVENT_ID                            = 1,
+  /**
+   * Number of event domain IDs for a device. Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_EVENT_DOMAIN_ID                     = 2,
+  /**
+   * Get global memory bandwidth in Kbytes/sec. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH                 = 3,
+  /**
+   * Get theoretical maximum number of instructions per cycle. Value
+   * is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_INSTRUCTION_PER_CYCLE                   = 4,
+  /**
+   * Get theoretical maximum number of single precision instructions
+   * that can be executed per second. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_INSTRUCTION_THROUGHPUT_SINGLE_PRECISION = 5,
+  /**
+   * Get number of frame buffers for device.  Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS                       = 6,
+  /**
+   * Get PCIE link rate in Mega bits/sec for device. Return 0 if bus-type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_LINK_RATE                          = 7,
+  /**
+   * Get PCIE link width for device. Return 0 if bus-type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH                         = 8,
+  /**
+   * Get PCIE generation for device. Return 0 if bus-type
+   * is non-PCIE. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_PCIE_GEN                                = 9,
+  /**
+   * Get the class for the device. Value is a
+   * CUpti_DeviceAttributeDeviceClass.
+   */
+  CUPTI_DEVICE_ATTR_DEVICE_CLASS                            = 10,
+  /**
+   * Get the peak single precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE                       = 11,
+  /**
+   * Get the peak double precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE                       = 12,
+  /**
+   * Get number of L2 units. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_L2_UNITS                           = 13,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_SHARED
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_SHARED = 14,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_L1
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_L1 = 15,
+  /**
+   * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_EQUAL
+   * preference. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_EQUAL = 16,
+  /**
+   * Get the peak half precision flop per cycle. Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE                       = 17,
+  /**
+   * Check if Nvlink is connected to device. Returns 1, if at least one
+   * Nvlink is connected to the device, returns 0 otherwise.
+   * Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_NVLINK_PRESENT                          = 18,
+  /**
+   * Check if Nvlink is present between GPU and CPU. Returns bandwidth,
+   * in bytes/sec, if Nvlink is present, returns 0 otherwise.
+   * Value is a uint64_t.
+   */
+  CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW                       = 19,
+  /**
+   * Check if NVSwitch is present in the underlying topology.
+   * Returns 1, if present, returns 0 otherwise.
+   * Value is a uint32_t.
+   */
+  CUPTI_DEVICE_ATTR_NVSWITCH_PRESENT                        = 20,
+  CUPTI_DEVICE_ATTR_FORCE_INT                               = 0x7fffffff,
+} CUpti_DeviceAttribute;
+/**
+ * \brief Event domain attributes.
+ *
+ * Event domain attributes. Except where noted, all the attributes can
+ * be read using either \ref cuptiDeviceGetEventDomainAttribute or
+ * \ref cuptiEventDomainGetAttribute.
+ *
+ * The enumerator values are not contiguous: no attribute is assigned
+ * the value 2 in this enumeration.
+ * NOTE(review): CUPTI_EVENT_DOMAIN_ATTR_FORCE_INT (0x7fffffff) is
+ * presumably a sentinel that forces the enum to int width rather than
+ * a readable attribute - confirm.
+ */
+typedef enum {
+  /**
+   * Event domain name. Value is a null terminated const c-string.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_NAME                 = 0,
+  /**
+   * Number of instances of the domain for which event counts will be
+   * collected.  The domain may have additional instances that cannot
+   * be profiled (see CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT).
+   * Can be read only with \ref
+   * cuptiDeviceGetEventDomainAttribute. Value is a uint32_t.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT       = 1,
+  /**
+   * Total number of instances of the domain, including instances that
+   * cannot be profiled.  Use CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT
+   * to get the number of instances that can be profiled. Can be read
+   * only with \ref cuptiDeviceGetEventDomainAttribute. Value is a
+   * uint32_t.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT = 3,
+  /**
+   * Collection method used for events contained in the event domain.
+   * Value is a \ref CUpti_EventCollectionMethod.
+   */
+  CUPTI_EVENT_DOMAIN_ATTR_COLLECTION_METHOD    = 4,
+  CUPTI_EVENT_DOMAIN_ATTR_FORCE_INT      = 0x7fffffff,
+} CUpti_EventDomainAttribute;
+/**
+ * \brief The collection method used for an event.
+ *
+ * The collection method indicates how an event is collected.
+ */
+typedef enum {
+  /**
+   * Event is collected using a hardware global performance monitor.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_PM                  = 0,
+  /**
+   * Event is collected using a hardware SM performance monitor.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_SM                  = 1,
+  /**
+   * Event is collected using software instrumentation.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_INSTRUMENTED        = 2,
+  /**
+   * Event is collected using NvLink throughput counter method.
+   */
+  CUPTI_EVENT_COLLECTION_METHOD_NVLINK_TC           = 3,
+  CUPTI_EVENT_COLLECTION_METHOD_FORCE_INT           = 0x7fffffff
+} CUpti_EventCollectionMethod;
+/**
+ * \brief Event group attributes.
+ *
+ * Event group attributes. These attributes can be read using \ref
+ * cuptiEventGroupGetAttribute. Attributes marked [rw] can also be
+ * written using \ref cuptiEventGroupSetAttribute.
+ */
+typedef enum {
+  /**
+   * The domain to which the event group is bound. This attribute is
+   * set when the first event is added to the group.  Value is a
+   * CUpti_EventDomainID.
+   */
+  CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID              = 0,
+  /**
+   * [rw] Profile all the instances of the domain for this
+   * eventgroup. This feature can be used to get load balancing
+   * across all instances of a domain. Value is an integer.
+   */
+  CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES = 1,
+  /**
+   * [rw] Reserved for user data.
+   */
+  CUPTI_EVENT_GROUP_ATTR_USER_DATA                    = 2,
+  /**
+   * Number of events in the group. Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS                   = 3,
+  /**
+   * Enumerates events in the group. Value is a pointer to buffer of
+   * size sizeof(CUpti_EventID) * num_of_events in the eventgroup.
+   * num_of_events can be queried using
+   * CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS.
+   */
+  CUPTI_EVENT_GROUP_ATTR_EVENTS                       = 4,
+  /**
+   * Number of instances of the domain bound to this event group that
+   * will be counted.  Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT               = 5,
+  /**
+   * Event group scope can be set to CUPTI_EVENT_PROFILING_SCOPE_DEVICE or
+   * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT for an eventGroup, before
+   * adding any event.
+   * Sets the scope of eventgroup as CUPTI_EVENT_PROFILING_SCOPE_DEVICE or
+   * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT when the scope of the events
+   * that will be added is CUPTI_EVENT_PROFILING_SCOPE_BOTH.
+   * If profiling scope of event is either
+   * CUPTI_EVENT_PROFILING_SCOPE_DEVICE or CUPTI_EVENT_PROFILING_SCOPE_CONTEXT
+   * then setting this attribute will not affect the default scope.
+   * It is not allowed to add events of different scope to same eventgroup.
+   * Value is a uint32_t.
+   */
+  CUPTI_EVENT_GROUP_ATTR_PROFILING_SCOPE               = 6,
+  CUPTI_EVENT_GROUP_ATTR_FORCE_INT                     = 0x7fffffff,
+} CUpti_EventGroupAttribute;
+/**
+ * \brief Profiling scope for event.
+ *
+ * The profiling scope of an event indicates whether the event can be
+ * collected at context scope, at device scope, or at either of the two.
+ */
+typedef enum {
+  /**
+   * Event is collected at context scope.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_CONTEXT                 = 0,
+  /**
+   * Event is collected at device scope.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_DEVICE                  = 1,
+  /**
+   * Event can be collected at device or context scope.
+   * The scope can be set using \ref cuptiEventGroupSetAttribute API.
+   */
+  CUPTI_EVENT_PROFILING_SCOPE_BOTH                    = 2,
+  CUPTI_EVENT_PROFILING_SCOPE_FORCE_INT               = 0x7fffffff
+} CUpti_EventProfilingScope;
+/**
+ * \brief Event attributes.
+ *
+ * Event attributes. These attributes can be read using \ref
+ * cuptiEventGetAttribute.
+ *
+ * The enumerator values are not contiguous: no attribute is assigned
+ * the value 4 in this enumeration.
+ * NOTE(review): CUPTI_EVENT_ATTR_FORCE_INT (0x7fffffff) is presumably
+ * a sentinel that forces the enum to int width rather than a readable
+ * attribute - confirm.
+ */
+typedef enum {
+  /**
+   * Event name. Value is a null terminated const c-string.
+   */
+  CUPTI_EVENT_ATTR_NAME              = 0,
+  /**
+   * Short description of event. Value is a null terminated const
+   * c-string.
+   */
+  CUPTI_EVENT_ATTR_SHORT_DESCRIPTION = 1,
+  /**
+   * Long description of event. Value is a null terminated const
+   * c-string.
+   */
+  CUPTI_EVENT_ATTR_LONG_DESCRIPTION  = 2,
+  /**
+   * Category of event. Value is CUpti_EventCategory.
+   */
+  CUPTI_EVENT_ATTR_CATEGORY          = 3,
+  /**
+   * Profiling scope of the events. It can be either device or context or both.
+   * Value is a \ref CUpti_EventProfilingScope.
+   */
+  CUPTI_EVENT_ATTR_PROFILING_SCOPE   = 5,
+  CUPTI_EVENT_ATTR_FORCE_INT         = 0x7fffffff,
+} CUpti_EventAttribute;
+/**
+ * \brief Event collection modes.
+ *
+ * The event collection mode determines the period over which the
+ * events within the enabled event groups will be collected.
+ */
+typedef enum {
+  /**
+   * Events are collected for the entire duration between the
+   * cuptiEventGroupEnable and cuptiEventGroupDisable calls.
+   * Event values are reset when the events are read.
+   * For CUDA toolkit v6.0 and older this was the default mode.
+   */
+  CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS          = 0,
+  /**
+   * Events are collected only for the durations of kernel executions
+   * that occur between the cuptiEventGroupEnable and
+   * cuptiEventGroupDisable calls. Event collection begins when a
+   * kernel execution begins, and stops when kernel execution
+   * completes. Event values are reset to zero when each kernel
+   * execution begins. If multiple kernel executions occur between the
+   * cuptiEventGroupEnable and cuptiEventGroupDisable calls then the
+   * event values must be read after each kernel launch if those
+   * events need to be associated with the specific kernel launch.
+   * Note that collection in this mode may significantly change the
+   * overall performance characteristics of the application because
+   * kernel executions that occur between the cuptiEventGroupEnable and
+   * cuptiEventGroupDisable calls are serialized on the GPU.
+   * This is the default mode starting with CUDA toolkit v6.5.
+   */
+  CUPTI_EVENT_COLLECTION_MODE_KERNEL              = 1,
+  CUPTI_EVENT_COLLECTION_MODE_FORCE_INT           = 0x7fffffff
+} CUpti_EventCollectionMode;
+/**
+ * \brief An event category.
+ *
+ * Each event is assigned to a category that represents the general
+ * type of the event. An event's category is accessed using \ref
+ * cuptiEventGetAttribute and the CUPTI_EVENT_ATTR_CATEGORY attribute.
+ */
+typedef enum {
+  /**
+   * An instruction related event.
+   */
+  CUPTI_EVENT_CATEGORY_INSTRUCTION     = 0,
+  /**
+   * A memory related event.
+   */
+  CUPTI_EVENT_CATEGORY_MEMORY          = 1,
+  /**
+   * A cache related event.
+   */
+  CUPTI_EVENT_CATEGORY_CACHE           = 2,
+  /**
+   * A profile-trigger event.
+   */
+  CUPTI_EVENT_CATEGORY_PROFILE_TRIGGER = 3,
+  /**
+   * A system event.
+   */
+  CUPTI_EVENT_CATEGORY_SYSTEM  = 4,
+  CUPTI_EVENT_CATEGORY_FORCE_INT       = 0x7fffffff
+} CUpti_EventCategory;
+/**
+ * \brief The overflow value for a CUPTI event.
+ *
+ * The CUPTI event value that indicates an overflow.
+ */
+#define CUPTI_EVENT_OVERFLOW ((uint64_t)0xFFFFFFFFFFFFFFFFULL)
+/**
+ * \brief The value that indicates the event value is invalid
+ */
+#define CUPTI_EVENT_INVALID ((uint64_t)0xFFFFFFFFFFFFFFFEULL)
+/**
+ * \brief Flags for cuptiEventGroupReadEvent and
+ * cuptiEventGroupReadAllEvents.
+ *
+ * Flags for \ref cuptiEventGroupReadEvent and \ref
+ * cuptiEventGroupReadAllEvents.
+ */
+typedef enum {
+  /**
+   * No flags.
+   */
+  CUPTI_EVENT_READ_FLAG_NONE          = 0,
+  CUPTI_EVENT_READ_FLAG_FORCE_INT     = 0x7fffffff,
+} CUpti_ReadEventFlags;
+/**
+ * \brief A set of event groups.
+ *
+ * A set of event groups. When returned by \ref
+ * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets
+ * a set indicates event groups that can be enabled at the same
+ * time (i.e. all the events in the set can be collected
+ * simultaneously).
+ */
+typedef struct {
+  /**
+   * The number of event groups in the set.
+   */
+  uint32_t numEventGroups;
+  /**
+   * An array of \p numEventGroups event groups.
+   */
+  CUpti_EventGroup *eventGroups;
+} CUpti_EventGroupSet;
+/**
+ * \brief A set of event group sets.
+ *
+ * A set of event group sets. When returned by \ref
+ * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets
+ * a CUpti_EventGroupSets indicates the number of passes required to
+ * collect all the events, and the event groups that should be
+ * collected during each pass.
+ */
+typedef struct {
+  /**
+   * Number of event group sets.
+   */
+  uint32_t numSets;
+  /**
+   * An array of \p numSets event group sets.
+   */
+  CUpti_EventGroupSet *sets;
+} CUpti_EventGroupSets;
+/**
+ * \brief Set the event collection mode.
+ *
+ * Set the event collection mode for a \p context.  The \p mode
+ * controls the event collection behavior of all events in event
+ * groups created in the \p context. This API is invalid in kernel
+ * replay mode.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \param mode The event collection mode
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_OPERATION if called when replay mode is enabled
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if mode is not supported on the device
+ */
+CUptiResult CUPTIAPI cuptiSetEventCollectionMode(CUcontext context,
+                                                 CUpti_EventCollectionMode mode);
+/**
+ * \brief Read a device attribute.
+ *
+ * Read a device attribute and return it in \p *value.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param attrib The attribute to read
+ * \param valueSize Size of the buffer pointed to by \p value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not a device attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
+                                             CUpti_DeviceAttribute attrib,
+                                             size_t *valueSize,
+                                             void *value);
+/**
+ * \brief Get the number of domains for a device.
+ *
+ * Returns the number of domains in \p numDomains for a device.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param numDomains Returns the number of domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
+                                                   uint32_t *numDomains);
+/**
+ * \brief Get the event domains for a device.
+ *
+ * Returns the event domains IDs in \p domainArray for a device.  The
+ * size of the \p domainArray buffer is given by \p
+ * *arraySizeBytes. The size of the \p domainArray buffer must be at
+ * least \p numdomains * sizeof(CUpti_EventDomainID) or else not all
+ * domains will be returned. The value returned in \p
+ * *arraySizeBytes contains the number of bytes returned in \p
+ * domainArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param arraySizeBytes The size of \p domainArray in bytes, and
+ * returns the number of bytes written to \p domainArray
+ * \param domainArray Returns the IDs of the event domains for the device
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p domainArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice device,
+                                                 size_t *arraySizeBytes,
+                                                 CUpti_EventDomainID *domainArray);
+/**
+ * \brief Read an event domain attribute.
+ *
+ * Returns an event domain attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param eventDomain ID of the event domain
+ * \param attrib The event domain attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event domain attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice device,
+                                                        CUpti_EventDomainID eventDomain,
+                                                        CUpti_EventDomainAttribute attrib,
+                                                        size_t *valueSize,
+                                                        void *value);
+/**
+ * \brief Get the number of event domains available on any device.
+ *
+ * Returns the total number of event domains available on any
+ * CUDA-capable device.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param numDomains Returns the number of domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL
+ */
+CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains);
+/**
+ * \brief Get the event domains available on any device.
+ *
+ * Returns all the event domains available on any CUDA-capable device.
+ * Event domain IDs are returned in \p domainArray. The size of the \p
+ * domainArray buffer is given by \p *arraySizeBytes. The size of the
+ * \p domainArray buffer must be at least \p numDomains *
+ * sizeof(CUpti_EventDomainID) or else not all domains will be
+ * returned. The value returned in \p *arraySizeBytes contains the
+ * number of bytes returned in \p domainArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param arraySizeBytes The size of \p domainArray in bytes, and
+ * returns the number of bytes written to \p domainArray
+ * \param domainArray Returns all the event domains
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
+ * \p domainArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
+                                           CUpti_EventDomainID *domainArray);
+/**
+ * \brief Read an event domain attribute.
+ *
+ * Returns an event domain attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param attrib The event domain attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event domain attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(CUpti_EventDomainID eventDomain,
+                                                  CUpti_EventDomainAttribute attrib,
+                                                  size_t *valueSize,
+                                                  void *value);
+/**
+ * \brief Get number of events in a domain.
+ *
+ * Returns the number of events in \p numEvents for a domain.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param numEvents Returns the number of events in the domain
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID eventDomain,
+                                                  uint32_t *numEvents);
+/**
+ * \brief Get the events in a domain.
+ *
+ * Returns the event IDs in \p eventArray for a domain.  The size of
+ * the \p eventArray buffer is given by \p *arraySizeBytes. The size
+ * of the \p eventArray buffer must be at least \p numdomainevents *
+ * sizeof(CUpti_EventID) or else not all events will be returned. The
+ * value returned in \p *arraySizeBytes contains the number of bytes
+ * returned in \p eventArray.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventDomain ID of the event domain
+ * \param arraySizeBytes The size of \p eventArray in bytes, and
+ * returns the number of bytes written to \p eventArray
+ * \param eventArray Returns the IDs of the events in the domain
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or \p
+ * eventArray are NULL
+ */
+CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
+                                                size_t *arraySizeBytes,
+                                                CUpti_EventID *eventArray);
+/**
+ * \brief Get an event attribute.
+ *
+ * Returns an event attribute in \p *value. The size of the \p
+ * value buffer is given by \p *valueSize. The value returned in \p
+ * *valueSize contains the number of bytes returned in \p value.
+ *
+ * If the attribute value is a c-string that is longer than \p
+ * *valueSize, then only the first \p *valueSize characters will be
+ * returned and there will be no terminating null byte.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param event ID of the event
+ * \param attrib The event attribute to read
+ * \param valueSize The size of the \p value buffer in bytes, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the attribute's value
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
+                                            CUpti_EventAttribute attrib,
+                                            size_t *valueSize,
+                                            void *value);
+/**
+ * \brief Find an event by name.
+ *
+ * Find an event by name and return the event ID in \p *event.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param device The CUDA device
+ * \param eventName The name of the event to find
+ * \param event Returns the ID of the found event or undefined if
+ * unable to find the event
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_DEVICE
+ * \retval CUPTI_ERROR_INVALID_EVENT_NAME if unable to find an event
+ * with name \p eventName. In this case \p *event is undefined
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventName or \p event are NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
+                                             const char *eventName,
+                                             CUpti_EventID *event);
+/**
+ * \brief Create a new event group for a context.
+ *
+ * Creates a new event group for \p context and returns the new group
+ * in \p *eventGroup.
+ * \note \p flags are reserved for future use and should be set to zero.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context for the event group
+ * \param eventGroup Returns the new event group
+ * \param flags Reserved - must be zero
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
+                                           CUpti_EventGroup *eventGroup,
+                                           uint32_t flags);
+/**
+ * \brief Destroy an event group.
+ *
+ * Destroy an \p eventGroup and free its resources. An event group
+ * cannot be destroyed if it is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group to destroy
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if the event group is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup);
+/**
+ * \brief Read an event group attribute.
+ *
+ * Read an event group attribute and return it in \p *value.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.).
+ *
+ * \param eventGroup The event group
+ * \param attrib The attribute to read
+ * \param valueSize Size of the buffer pointed to by \p value, and
+ * returns the number of bytes written to \p value
+ * \param value Returns the value of the attribute
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an eventgroup attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
+ * attribute values, indicates that the \p value buffer is too small
+ * to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(CUpti_EventGroup eventGroup,
+                                                 CUpti_EventGroupAttribute attrib,
+                                                 size_t *valueSize,
+                                                 void *value);
+/**
+ * \brief Write an event group attribute.
+ *
+ * Write an event group attribute.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param attrib The attribute to write
+ * \param valueSize The size, in bytes, of the value
+ * \param value The attribute value to write
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
+ * is NULL, or if \p attrib is not an event group attribute, or if
+ * \p attrib is not a writable attribute
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
+ * the \p value buffer is too small to hold the attribute value.
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(CUpti_EventGroup eventGroup,
+                                                 CUpti_EventGroupAttribute attrib,
+                                                 size_t valueSize,
+                                                 void *value);
+/**
+ * \brief Add an event to an event group.
+ *
+ * Add an event to an event group. The event add can fail for a number of reasons:
+ * \li The event group is enabled
+ * \li The event does not belong to the same event domain as the
+ * events that are already in the event group
+ * \li Device limitations on the events that can belong to the same group
+ * \li The event group is full
+ *
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param event The event to add to the group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p event belongs to a
+ * different event domain than the events already in \p eventGroup, or
+ * if a device limitation prevents \p event from being collected at
+ * the same time as the events already in \p eventGroup
+ * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if \p eventGroup is full
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
+                                             CUpti_EventID event);
+/**
+ * \brief Remove an event from an event group.
+ *
+ * Remove \p event from an event group. The event cannot be
+ * removed if the event group is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ * \param event The event to remove from the group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
+                                                CUpti_EventID event);
+/**
+ * \brief Remove all events from an event group.
+ *
+ * Remove all events from an event group. Events cannot be removed if
+ * the event group is enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup);
+/**
+ * \brief Zero all the event counts in an event group.
+ *
+ * Zero all the event counts in an event group.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.).
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup);
+/**
+ * \brief Enable an event group.
+ *
+ * Enable an event group. Enabling an event group zeros the value of
+ * all the events in the group and then starts collection of those
+ * events.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_NOT_READY if \p eventGroup does not contain any events
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p eventGroup cannot be
+ * enabled due to other already enabled event groups
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ * \retval CUPTI_ERROR_HARDWARE_BUSY if another client is profiling
+ * and hardware is busy
+ */
+CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup);
+/**
+ * \brief Disable an event group.
+ *
+ * Disable an event group. Disabling an event group stops collection
+ * of events contained in the group.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroup The event group
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup);
+/**
+ * \brief Read the value for an event in an event group.
+ *
+ * Read the value for an event in an event group. The event value is
+ * returned in the \p eventValueBuffer buffer. \p
+ * eventValueBufferSizeBytes indicates the size of the \p
+ * eventValueBuffer buffer. The buffer must be at least sizeof(uint64)
+ * if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set
+ * on the group containing the event.  The buffer must be at least
+ * (sizeof(uint64) * number of domain instances) if
+ * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is set on the
+ * group.
+ *
+ * If any instance of an event counter overflows, the value returned
+ * for that event instance will be ::CUPTI_EVENT_OVERFLOW.
+ *
+ * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE.
+ *
+ * Reading an event from a disabled event group is not allowed. After
+ * being read, an event's value is reset to zero.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is
+ * called simultaneously with this function, then returned event
+ * values are undefined.
+ *
+ * \param eventGroup The event group
+ * \param flags Flags controlling the reading mode
+ * \param event The event to read
+ * \param eventValueBufferSizeBytes The size of \p eventValueBuffer
+ * in bytes, and returns the number of bytes written to \p
+ * eventValueBuffer
+ * \param eventValueBuffer Returns the event value(s)
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p
+ * eventValueBufferSizeBytes or \p eventValueBuffer is NULL
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer
+ * is not sufficient
+ */
+CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
+                                              CUpti_ReadEventFlags flags,
+                                              CUpti_EventID event,
+                                              size_t *eventValueBufferSizeBytes,
+                                              uint64_t *eventValueBuffer);
+/**
+ * \brief Read the values for all the events in an event group.
+ *
+ * Read the values for all the events in an event group. The event
+ * values are returned in the \p eventValueBuffer buffer. \p
+ * eventValueBufferSizeBytes indicates the size of \p
+ * eventValueBuffer.  The buffer must be at least (sizeof(uint64) *
+ * number of events in group) if
+ * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set on
+ * the group containing the events.  The buffer must be at least
+ * (sizeof(uint64) * number of domain instances * number of events in
+ * group) if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is
+ * set on the group.
+ *
+ * The data format returned in \p eventValueBuffer is:
+ *    - domain instance 0: event0 event1 ... eventN
+ *    - domain instance 1: event0 event1 ... eventN
+ *    - ...
+ *    - domain instance M: event0 event1 ... eventN
+ *
+ * The event order in \p eventValueBuffer is returned in \p
+ * eventIdArray. The size of \p eventIdArray is specified in \p
+ * eventIdArraySizeBytes. The size should be at least
+ * (sizeof(CUpti_EventID) * number of events in group).
+ *
+ * If any instance of any event counter overflows, the value returned
+ * for that event instance will be ::CUPTI_EVENT_OVERFLOW.
+ *
+ * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE.
+ *
+ * Reading events from a disabled event group is not allowed. After
+ * being read, an event's value is reset to zero.
+ * \note \b Thread-safety: this function is thread safe but client
+ * must guard against simultaneous destruction or modification of \p
+ * eventGroup (for example, client must guard against simultaneous
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
+ * etc.), and must guard against simultaneous destruction of the
+ * context in which \p eventGroup was created (for example, client
+ * must guard against simultaneous calls to cudaDeviceReset,
+ * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is
+ * called simultaneously with this function, then returned event
+ * values are undefined.
+ *
+ * \param eventGroup The event group
+ * \param flags Flags controlling the reading mode
+ * \param eventValueBufferSizeBytes The size of \p eventValueBuffer in
+ * bytes, and returns the number of bytes written to \p
+ * eventValueBuffer
+ * \param eventValueBuffer Returns the event values
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
+ * and returns the number of bytes written to \p eventIdArray
+ * \param eventIdArray Returns the IDs of the events in the same order
+ * as the values returned in \p eventValueBuffer.
+ * \param numEventIdsRead Returns the number of event IDs returned
+ * in \p eventIdArray
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p
+ * eventValueBufferSizeBytes, \p eventValueBuffer, \p
+ * eventIdArraySizeBytes, \p eventIdArray or \p numEventIdsRead is
+ * NULL
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer
+ * or \p eventIdArray is not sufficient
+ */
+CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup       eventGroup,
+                                                  CUpti_ReadEventFlags   flags,
+                                                  size_t                 *eventValueBufferSizeBytes,
+                                                  uint64_t               *eventValueBuffer,
+                                                  size_t                 *eventIdArraySizeBytes,
+                                                  CUpti_EventID          *eventIdArray,
+                                                  size_t                 *numEventIdsRead);
+/**
+ * \brief For a set of events, get the grouping that indicates the
+ * number of passes and the event groups necessary to collect the
+ * events.
+ *
+ * The number of events that can be collected simultaneously varies by
+ * device and by the type of the events. When events can be collected
+ * simultaneously, they may need to be grouped into multiple event
+ * groups because they are from different event domains. This function
+ * takes a set of events and determines how many passes are required
+ * to collect all those events, and which events can be collected
+ * simultaneously in each pass.
+ *
+ * The CUpti_EventGroupSets returned in \p eventGroupPasses indicates
+ * how many passes are required to collect the events with the \p
+ * numSets field. Within each event group set, the \p sets array
+ * indicates the event groups that should be collected on each pass.
+ * \note \b Thread-safety: this function is thread safe, but client
+ * must guard against another thread simultaneously destroying \p
+ * context.
+ *
+ * \param context The context for event collection
+ * \param eventIdArraySizeBytes Size of \p eventIdArray in bytes
+ * \param eventIdArray Array of event IDs that need to be grouped
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
+ * indicates the number of passes required to collect the events and
+ * the events to collect on each pass
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArray or
+ * \p eventGroupPasses is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(CUcontext context,
+                                               size_t eventIdArraySizeBytes,
+                                               CUpti_EventID *eventIdArray,
+                                               CUpti_EventGroupSets **eventGroupPasses);
+/**
+ * \brief Destroy an event group sets object.
+ *
+ * Destroy a CUpti_EventGroupSets object.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroupSets The object to destroy
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_INVALID_OPERATION if any of the event groups
+ * contained in the sets is enabled
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSets is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets);
+/**
+ * \brief Enable an event group set.
+ *
+ * Enable a set of event groups. Enabling a set of event groups zeros the value of
+ * all the events in all the groups and then starts collection of those events.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param eventGroupSet The pointer to the event group set
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_NOT_READY if an event group in \p eventGroupSet
+ * does not contain any events
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if an event group in \p eventGroupSet
+ * cannot be enabled due to other already enabled event groups
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL
+ * \retval CUPTI_ERROR_HARDWARE_BUSY if another client is profiling and
+ * hardware is busy
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet);
+/**
+ * \brief Disable an event group set.
+ *
+ * Disable a set of event groups. Disabling a set of event groups
+ * stops collection of events contained in the groups.
+ * \note \b Thread-safety: this function is thread safe.
+ * \note If this call fails, some of the event groups in the set may be disabled
+ * and other event groups may remain enabled.
+ *
+ * \param eventGroupSet The pointer to the event group set
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
+ * \retval CUPTI_ERROR_HARDWARE
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL
+ */
+CUptiResult CUPTIAPI cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet);
+/**
+ * \brief Enable kernel replay mode.
+ *
+ * Set profiling mode for the context to replay mode. In this mode,
+ * any number of events can be collected in one run of the kernel. The
+ * event collection mode will automatically switch to
+ * CUPTI_EVENT_COLLECTION_MODE_KERNEL.  In this mode, \ref
+ * cuptiSetEventCollectionMode will return
+ * CUPTI_ERROR_INVALID_OPERATION.
+ * \note \b Kernels might take longer to run if many events are enabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context);
+/**
+ * \brief Disable kernel replay mode.
+ *
+ * Set profiling mode for the context to non-replay (default)
+ * mode. Event collection mode will be set to
+ * CUPTI_EVENT_COLLECTION_MODE_KERNEL.  All previously enabled
+ * event groups and event group sets will be disabled.
+ * \note \b Thread-safety: this function is thread safe.
+ *
+ * \param context The context
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context);
+/**
+ * \brief Function type for getting updates on kernel replay.
+ *
+ * \param kernelName The mangled kernel name
+ * \param numReplaysDone Number of replays done so far
+ * \param customData Pointer of any custom data passed in when subscribing
+ */
+typedef void (CUPTIAPI *CUpti_KernelReplayUpdateFunc)(
+    const char *kernelName,
+    int numReplaysDone,
+    void *customData);
+/**
+ * \brief Subscribe to kernel replay updates.
+ *
+ * When subscribed, the function pointer passed in will be called each time a
+ * kernel run is finished during kernel replay. A previously subscribed function
+ * pointer will be replaced. Passing NULL as the function pointer unsubscribes
+ * from the updates.
+ *
+ * \param updateFunc The update function pointer
+ * \param customData Pointer to any custom data
+ * \retval CUPTI_SUCCESS
+ */
+CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(CUpti_KernelReplayUpdateFunc updateFunc, void *customData);
+/** @} */ /* END CUPTI_EVENT_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+#if defined(__cplusplus)
+}
+#endif
+#endif /*_CUPTI_EVENTS_H_*/

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling.h ADDED Viewed

	@@ -0,0 +1,936 @@

+/*
+ * Copyright 2020-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(_CUPTI_PCSAMPLING_H_)
+#define _CUPTI_PCSAMPLING_H_
+#include <cuda.h>
+#include <stdint.h>
+#include <stddef.h>
+#include "cupti_result.h"
+#include "cupti_common.h"
+#if defined(__cplusplus)
+extern "C" {
+#endif
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility push(default)
+#endif
+/**
+ * \defgroup CUPTI_PCSAMPLING_API CUPTI PC Sampling API
+ * Functions, types, and enums that implement the CUPTI PC Sampling API.
+ * @{
+ */
+#ifndef CUPTI_PCSAMPLING_STRUCT_SIZE
+#define CUPTI_PCSAMPLING_STRUCT_SIZE(type_, lastfield_)                     (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
+#endif
+#ifndef CUPTI_STALL_REASON_STRING_SIZE
+#define CUPTI_STALL_REASON_STRING_SIZE                                            128
+#endif
+/**
+ * \brief PC Sampling collection mode
+ */
+typedef enum
+{
+  /**
+   * INVALID Value
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_INVALID                   = 0,
+  /**
+   * Continuous mode. Kernels are not serialized in this mode.
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS                = 1,
+  /**
+   * Serialized mode. Kernels are serialized in this mode.
+   */
+  CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED         = 2,
+} CUpti_PCSamplingCollectionMode;
+/**
+ * \brief PC Sampling stall reasons
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [r] Collected stall reason index
+   */
+  uint32_t pcSamplingStallReasonIndex;
+  /**
+   * [r] Number of times the PC was sampled with the stallReason.
+   */
+  uint32_t samples;
+} CUpti_PCSamplingStallReason;
+/**
+ * \brief PC Sampling data
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [w] Size of the data structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [r] Unique cubin id
+   */
+  uint64_t cubinCrc;
+  /**
+   * [r] PC offset
+   */
+  uint64_t pcOffset;
+  /**
+   * The function's unique symbol index in the module.
+   */
+  uint32_t functionIndex;
+  /**
+   * Padding
+   */
+  uint32_t pad;
+  /**
+   * [r] The function name. This name string might be shared across all the records
+   * including records from activity APIs representing the same function, and so it should not be
+   * modified or freed until post processing of all the records is done. Once done, it is the user's
+   * responsibility to free the memory using the free() function.
+   */
+  char* functionName;
+  /**
+   * [r] Collected stall reason count
+   */
+  size_t stallReasonCount;
+  /**
+   * [r] Collected stall reasons: the stall reason index and the total samples
+   * for each one (see \ref CUpti_PCSamplingStallReason).
+   */
+  CUpti_PCSamplingStallReason *stallReason;
+  /**
+   * The correlation ID of the kernel to which this result is associated. Only valid for serialized mode of pc sampling collection.
+   * For continuous mode of collection the correlationId will be set to 0.
+   */
+  uint32_t correlationId;
+} CUpti_PCSamplingPCData;
+/**
+ * \brief PC Sampling output data format
+ */
+typedef enum
+{
+    CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_INVALID          = 0,
+  /**
+   * HW buffer data will be parsed during collection of data
+   */
+    CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED           = 1,
+} CUpti_PCSamplingOutputDataFormat;
+/**
+ * \brief Collected PC Sampling data
+ *
+ */
+typedef struct PACKED_ALIGNMENT
+{
+  /**
+   * [w] Size of the data structure.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Number of PCs to be collected
+   */
+  size_t collectNumPcs;
+  /**
+   * [r] Number of samples collected across all PCs.
+   * It includes samples for user modules, samples for non-user kernels and dropped samples.
+   * It includes counts for all non selected stall reasons.
+   * CUPTI does not provide PC records for non-user kernels.
+   * CUPTI does not provide PC records for instructions for which all selected stall reason metrics counts are zero.
+   */
+  uint64_t totalSamples;
+  /**
+   * [r] Number of samples that were dropped by hardware due to backpressure/overflow.
+   */
+  uint64_t droppedSamples;
+  /**
+   * [r] Number of PCs collected
+   */
+  size_t totalNumPcs;
+  /**
+   * [r] Number of PCs available for collection
+   */
+  size_t remainingNumPcs;
+  /**
+   * [r] Unique identifier for each range.
+   * Data collected across multiple ranges in multiple buffers can be identified using range id.
+   */
+  uint64_t rangeId;
+  /**
+   * [r] Profiled PC data
+   * This buffer should have enough memory to hold the number of PCs specified in \ref collectNumPcs
+   */
+  CUpti_PCSamplingPCData *pPcData;
+  /**
+   * [r] Number of samples collected across all non user kernels PCs.
+   * It includes samples for non-user kernels.
+   * It includes counts for all non selected stall reasons as well.
+   * CUPTI does not provide PC records for non-user kernels.
+   */
+  uint64_t nonUsrKernelsTotalSamples;
+  /**
+   * [r] Status of the hardware buffer.
+   * CUPTI returns the error code CUPTI_ERROR_OUT_OF_MEMORY when hardware buffer is full.
+   * When hardware buffer is full, user will get pc data as 0. To mitigate this issue, one or more of the below options can be tried:
+   * 1. Increase the hardware buffer size using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE
+   * 2. Decrease the thread sleep span using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN
+   * 3. Decrease the sampling frequency using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD
+   */
+  uint8_t hardwareBufferFull;
+} CUpti_PCSamplingData;
+/**
+ * \brief PC Sampling configuration attributes
+ *
+ * PC Sampling configuration attribute types. These attributes can be read
+ * using \ref cuptiPCSamplingGetConfigurationAttribute and can be written
+ * using \ref cuptiPCSamplingSetConfigurationAttribute. Attributes marked
+ * [r] can only be read using \ref cuptiPCSamplingGetConfigurationAttribute
+ * [w] can only be written using \ref cuptiPCSamplingSetConfigurationAttribute
+ * [rw] can be read using \ref cuptiPCSamplingGetConfigurationAttribute and
+ * written using \ref cuptiPCSamplingSetConfigurationAttribute
+ */
+typedef enum
+{
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_INVALID                            = 0,
+  /**
+   * [rw] Sampling period for PC Sampling.
+   * DEFAULT - CUPTI defined value based on number of SMs
+   * Valid values for the sampling
+   * periods are between 5 to 31 both inclusive. This will set the
+   * sampling period to (2^samplingPeriod) cycles.
+   * For e.g. for sampling period = 5 to 31, cycles = 32, 64, 128,..., 2^31
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD                    = 1,
+  /**
+   * [w] Number of stall reasons to collect.
+   * DEFAULT - All stall reasons will be collected
+   * Value is a size_t
+   * [w] Stall reasons to collect
+   * DEFAULT - All stall reasons will be collected
+   * Input value should be a pointer pointing to array of stall reason indexes
+   * containing all the stall reason indexes to collect.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON                       = 2,
+  /**
+   * [rw] Size of SW buffer for raw PC counter data downloaded from HW buffer
+   * DEFAULT - 1 MB, which can accommodate approximately 5500 PCs
+   * with all stall reasons
+   * Approximately it takes 16 Bytes (and some fixed size memory)
+   * to accommodate one PC with one stall reason
+   * For e.g. 1 PC with 1 stall reason = 32 Bytes
+   *          1 PC with 2 stall reason = 48 Bytes
+   *          1 PC with 4 stall reason = 96 Bytes
+   * Value is a size_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE                = 3,
+  /**
+   * [rw] Size of HW buffer in bytes
+   * DEFAULT - 512 MB
+   * If the sampling period is too low, the HW buffer can overflow
+   * and drop PC data
+   * Value is a size_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE               = 4,
+  /**
+   * [rw] PC Sampling collection mode
+   * DEFAULT - CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS
+   * Input value should be of type \ref CUpti_PCSamplingCollectionMode.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE                    = 5,
+  /**
+   * [rw] Control over PC Sampling data collection range
+   * Default - 0
+   * 1 - Allows user to start and stop PC Sampling using APIs -
+   * \ref cuptiPCSamplingStart() - Start PC Sampling
+   * \ref cuptiPCSamplingStop() - Stop PC Sampling
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL          = 6,
+  /**
+   * [w] Value for output data format
+   * Default - CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED
+   * Input value should be of type \ref CUpti_PCSamplingOutputDataFormat.
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT                 = 7,
+  /**
+   * [w] Data buffer to hold collected PC Sampling data PARSED_DATA
+   * Default - none.
+   * Buffer type is void * which can point to PARSED_DATA
+   * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER               = 8,
+  /**
+   * [rw] Control sleep time of the worker threads created by CUPTI for various PC sampling operations.
+   * CUPTI creates multiple worker threads to offload certain operations to these threads. This includes decoding of HW data to
+   * the CUPTI PC sampling data and correlating PC data to SASS instructions. CUPTI wakes up these threads periodically.
+   * Default - 100 milliseconds.
+   * Value is a uint32_t
+   */
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN  = 9,
+  CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_FORCE_INT                          = 0x7fffffff,
+} CUpti_PCSamplingConfigurationAttributeType;
+/**
+ * \brief PC sampling configuration information structure
+ *
+ * This structure provides \ref CUpti_PCSamplingConfigurationAttributeType which can be configured
+ * or queried for PC sampling configuration
+ */
+typedef struct
+{
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationAttributeType for all supported attribute types
+   */
+  CUpti_PCSamplingConfigurationAttributeType attributeType;
+  /**
+   * Configure or query status for \p attributeType
+   * CUPTI_SUCCESS for valid \p attributeType and \p attributeData
+   * CUPTI_ERROR_INVALID_OPERATION if \p attributeData is not valid
+   * CUPTI_ERROR_INVALID_PARAMETER if \p attributeType is not valid
+   */
+  CUptiResult attributeStatus;
+  union
+  {
+    /**
+     * Invalid Value
+     */
+    struct
+    {
+      uint64_t data[3];
+    } invalidData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD
+     */
+    struct
+    {
+      uint32_t samplingPeriod;
+    } samplingPeriodData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON
+     */
+    struct
+    {
+      size_t stallReasonCount;
+      uint32_t *pStallReasonIndex;
+    } stallReasonData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE
+     */
+    struct
+    {
+      size_t scratchBufferSize;
+    } scratchBufferSizeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE
+     */
+    struct
+    {
+      size_t hardwareBufferSize;
+    } hardwareBufferSizeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE
+     */
+    struct
+    {
+      CUpti_PCSamplingCollectionMode collectionMode;
+    } collectionModeData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL
+     */
+    struct
+    {
+      uint32_t enableStartStopControl;
+    } enableStartStopControlData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT
+     */
+    struct
+    {
+      CUpti_PCSamplingOutputDataFormat outputDataFormat;
+    } outputDataFormatData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER
+     */
+    struct
+    {
+      void *samplingDataBuffer;
+    } samplingDataBufferData;
+    /**
+     * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN
+     */
+    struct
+    {
+      uint32_t workerThreadPeriodicSleepSpan;
+    } workerThreadPeriodicSleepSpanData;
+  } attributeData;
+} CUpti_PCSamplingConfigurationInfo;
+/**
+ * \brief PC sampling configuration structure
+ *
+ * This structure configures PC sampling using \ref cuptiPCSamplingSetConfigurationAttribute
+ * and queries PC sampling default configuration using \ref cuptiPCSamplingGetConfigurationAttribute
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure i.e. CUpti_PCSamplingConfigurationInfoParamsSize
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [w] Number of attributes to configure using \ref cuptiPCSamplingSetConfigurationAttribute or query
+   * using \ref cuptiPCSamplingGetConfigurationAttribute
+   */
+  size_t numAttributes;
+  /**
+   * Refer \ref CUpti_PCSamplingConfigurationInfo
+   */
+  CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
+} CUpti_PCSamplingConfigurationInfoParams;
+#define CUpti_PCSamplingConfigurationInfoParamsSize                 CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingConfigurationInfoParams,pPCSamplingConfigurationInfo)
+/**
+ * \brief Write PC Sampling configuration attribute.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * some invalid attribute.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if an attribute value is not valid
+ * or any field of \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingSetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams);
+/**
+ * \brief Read PC Sampling configuration attribute.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams
+ * containing PC sampling configuration.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * some invalid attribute.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if an attribute is not valid
+ * or any \p pParams is not valid
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT indicates that
+ * the value buffer is too small to hold the attribute value
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams);
+/**
+ * \brief Params for cuptiPCSamplingGetData
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingGetDataParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * Data buffer to hold collected PC Sampling data PARSED_DATA
+   * Buffer type is void * which can point to PARSED_DATA
+   * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA
+   */
+  void *pcSamplingData;
+} CUpti_PCSamplingGetDataParams;
+#define CUpti_PCSamplingGetDataParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetDataParams, pcSamplingData)
+/**
+ * \brief Flush GPU PC sampling data periodically.
+ *
+ * Flushing of GPU PC Sampling data is required at the following points to maintain uniqueness of PCs:
+ * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, after every module load-unload-load
+ * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED, after every kernel ends
+ * If configuration option \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL
+ * is enabled, then after every range end i.e. \ref cuptiPCSamplingStop()
+ *
+ * If application is profiled in \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, with disabled
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL, and there is no module unload,
+ * user can collect data in two ways:
+ * Use \ref cuptiPCSamplingGetData() API periodically
+ * Use \ref cuptiPCSamplingDisable() on application exit and read GPU PC sampling data from sampling
+ * data buffer passed during configuration.
+ * Note: In case \ref cuptiPCSamplingGetData() API is not called periodically, the sampling data buffer
+ * passed during configuration should be large enough to hold all PCs data.
+ *       \ref cuptiPCSamplingGetData() API never does device synchronization.
+ *       It is possible that when the API is called there is some unconsumed data from the HW buffer. In this case
+ * CUPTI provides only the data available with it at that moment.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetDataParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called without
+ * enabling PC sampling.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY indicates that the HW buffer is full
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetData(CUpti_PCSamplingGetDataParams *pParams);
+/**
+ * \brief Params for cuptiPCSamplingEnable
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingEnableParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingEnableParams;
+#define CUpti_PCSamplingEnableParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingEnableParams, ctx)
+/**
+ * \brief Enable PC sampling.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingEnableParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingEnable(CUpti_PCSamplingEnableParams *pParams);
+/**
+ * \brief Params for cuptiPCSamplingDisable
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingDisableParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingDisableParams;
+#define CUpti_PCSamplingDisableParamsSize                           CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingDisableParams, ctx)
+/**
+ * \brief Disable PC sampling.
+ *
+ * For applications that don't destroy the CUDA context explicitly,
+ * this API does the PC Sampling tear-down, joins threads and copies PC records into the buffer provided
+ * during the PC sampling configuration. PC records which can't be accommodated in the buffer are discarded.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingDisableParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingDisable(CUpti_PCSamplingDisableParams *pParams);
+/**
+ * \brief Params for cuptiPCSamplingStart
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingStartParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingStartParams;
+#define CUpti_PCSamplingStartParamsSize                             CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStartParams, ctx)
+/**
+ * \brief Start PC sampling.
+ *
+ * User can collect PC Sampling data for a user-defined range specified by Start/Stop APIs.
+ * This API can be used to mark the start of a range. Set configuration option
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingStartParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * incorrect PC Sampling configuration.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingStart(CUpti_PCSamplingStartParams *pParams);
+/**
+ * \brief Params for cuptiPCSamplingStop
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingStopParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+} CUpti_PCSamplingStopParams;
+#define CUpti_PCSamplingStopParamsSize                              CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStopParams, ctx)
+/**
+ * \brief Stop PC sampling.
+ *
+ * User can collect PC Sampling data for a user-defined range specified by Start/Stop APIs.
+ * This API can be used to mark the end of a range. Set configuration option
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingStopParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
+ * incorrect PC Sampling configuration.
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingStop(CUpti_PCSamplingStopParams *pParams);
+/**
+ * \brief Params for cuptiPCSamplingGetNumStallReasons
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingGetNumStallReasonsParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [r] Number of stall reasons, read through this pointer
+   */
+  size_t *numStallReasons;
+} CUpti_PCSamplingGetNumStallReasonsParams;
+#define CUpti_PCSamplingGetNumStallReasonsParamsSize                CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetNumStallReasonsParams, numStallReasons)
+/**
+ * \brief Get PC sampling stall reason count.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetNumStallReasonsParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetNumStallReasons(CUpti_PCSamplingGetNumStallReasonsParams *pParams);
+/**
+ * \brief Params for cuptiPCSamplingGetStallReasons
+ */
+typedef struct
+{
+  /**
+   * [w] Size of the data structure, i.e. CUpti_PCSamplingGetStallReasonsParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Assign to NULL
+   */
+  void* pPriv;
+  /**
+   * [w] CUcontext
+   */
+  CUcontext ctx;
+  /**
+   * [w] Number of stall reasons
+   */
+  size_t numStallReasons;
+  /**
+   * [r] Stall reason indices
+   * NOTE(review): presumably an array of numStallReasons entries - confirm against the CUPTI documentation.
+   */
+  uint32_t *stallReasonIndex;
+  /**
+   * [r] Stall reason names
+   * NOTE(review): presumably an array of numStallReasons strings - confirm against the CUPTI documentation.
+   */
+  char **stallReasons;
+} CUpti_PCSamplingGetStallReasonsParams;
+#define CUpti_PCSamplingGetStallReasonsParamsSize                   CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetStallReasonsParams, stallReasons)
+/**
+ * \brief Get PC sampling stall reasons.
+ *
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetStallReasonsParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
+ * does not support the API
+ */
+CUptiResult CUPTIAPI cuptiPCSamplingGetStallReasons(CUpti_PCSamplingGetStallReasonsParams *pParams);
+/**
+ * \brief Params for cuptiGetSassToSourceCorrelation
+ */
+typedef struct CUpti_GetSassToSourceCorrelationParams {
+  /**
+   * [w] Size of the data structure, i.e. CUpti_GetSassToSourceCorrelationParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Pointer to the cubin binary to which the function belongs.
+   */
+  const void* cubin;
+  /**
+   * [w] Name of the function to which the PC belongs.
+   */
+  const char *functionName;
+  /**
+   * [w] Size of cubin binary.
+   */
+  size_t cubinSize;
+  /**
+   * [r] Line number in the source code.
+   */
+  uint32_t lineNumber;
+  /**
+   * [w] PC offset
+   */
+  uint64_t pcOffset;
+  /**
+   * [r] Path for the source file.
+   */
+  char *fileName;
+  /**
+   * [r] Path for the directory of source file.
+   */
+  char *dirName;
+} CUpti_GetSassToSourceCorrelationParams;
+#define CUpti_GetSassToSourceCorrelationParamsSize     CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetSassToSourceCorrelationParams, dirName)
+/**
+ * \brief SASS to Source correlation.
+ *
+ * \param pParams A pointer to \ref CUpti_GetSassToSourceCorrelationParams
+ *
+ * The user is expected to free the memory allocated for fileName and dirName after use.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if either of the parameters cubin or functionName
+ * is NULL or cubinSize is zero or size field is not set correctly.
+ * \retval CUPTI_ERROR_INVALID_MODULE provided cubin is invalid.
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred.
+ * This error code is also used for cases when the function is not present in the module.
+ * A better error code will be returned in a future release.
+ */
+CUptiResult CUPTIAPI cuptiGetSassToSourceCorrelation(CUpti_GetSassToSourceCorrelationParams *pParams);
+/**
+ * \brief Params for cuptiGetCubinCrc
+ */
+typedef struct {
+  /**
+   * [w] Size of the data structure, i.e. CUpti_GetCubinCrcParamsSize.
+   * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
+   * available in the structure. Used to preserve backward compatibility.
+   */
+  size_t size;
+  /**
+   * [w] Size of cubin binary.
+   */
+  size_t cubinSize;
+  /**
+   * [w] Pointer to cubin binary
+   */
+  const void* cubin;
+  /**
+   * [r] The computed CRC is stored in this field.
+   */
+  uint64_t cubinCrc;
+} CUpti_GetCubinCrcParams;
+#define CUpti_GetCubinCrcParamsSize     CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetCubinCrcParams, cubinCrc)
+/**
+ * \brief Get the CRC of cubin.
+ *
+ * This function computes the CRC of the provided cubin binary; the result is
+ * stored in the cubinCrc field of \p pParams.
+ *
+ * \param pParams A pointer to \ref CUpti_GetCubinCrcParams
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if parameter cubin is NULL or
+ * provided cubinSize is zero or size field is not set.
+ */
+CUptiResult CUPTIAPI cuptiGetCubinCrc(CUpti_GetCubinCrcParams *pParams);
+/**
+ * \brief Function type for callback used by CUPTI to request crc of
+ * loaded module.
+ *
+ * This callback function asks for the crc of the provided module.
+ * The provided crc will be stored in PC sampling records i.e. in the field 'cubinCrc' of the PC sampling
+ * struct CUpti_PCSamplingPCData. The CRC is used during the offline source correlation to uniquely identify the module.
+ *
+ * \param cubin The pointer to cubin binary
+ * \param cubinSize The size of cubin binary.
+ * \param cubinCrc Returns the computed crc of cubin.
+ */
+typedef void (CUPTIAPI *CUpti_ComputeCrcCallbackFunc)(
+    const void* cubin,
+    size_t cubinSize,
+    uint64_t *cubinCrc);
+/**
+ * \brief Register callback function with CUPTI to use
+ * your own algorithm to compute cubin crc.
+ *
+ * This function registers a callback function which gets called
+ * from CUPTI when a CUDA module is loaded.
+ *
+ * \param funcComputeCubinCrc Callback invoked when a CUDA module
+ * is loaded.
+ *
+ * \retval CUPTI_SUCCESS
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcComputeCubinCrc is NULL.
+ */
+CUptiResult CUPTIAPI cuptiRegisterComputeCrcCallback(CUpti_ComputeCrcCallbackFunc funcComputeCubinCrc);
+/** @} */ /* END CUPTI_PCSAMPLING_API */
+#if defined(__GNUC__) && defined(CUPTI_LIB)
+    #pragma GCC visibility pop
+#endif
+#if defined(__cplusplus)
+}
+#endif
+#endif /*_CUPTI_PCSAMPLING_H_*/

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_runtime_cbid.h ADDED Viewed

	@@ -0,0 +1,504 @@

+// *************************************************************************
+//      Definitions of indices for API functions, unique across entire API
+// *************************************************************************
+// This file is generated.  Any changes you make will be lost during the next clean build.
+// CUDA public interface, for type definitions and cu* function prototypes
+#if !defined(_CUPTI_RUNTIME_CBID_H)
+#define _CUPTI_RUNTIME_CBID_H
+typedef enum CUpti_runtime_api_trace_cbid_enum {
+    CUPTI_RUNTIME_TRACE_CBID_INVALID                                                       = 0,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDriverGetVersion_v3020                                    = 1,
+    CUPTI_RUNTIME_TRACE_CBID_cudaRuntimeGetVersion_v3020                                   = 2,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020                                      = 3,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020                                 = 4,
+    CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020                                        = 5,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetChannelDesc_v3020                                      = 6,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateChannelDesc_v3020                                   = 7,
+    CUPTI_RUNTIME_TRACE_CBID_cudaConfigureCall_v3020                                       = 8,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020                                       = 9,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020                                        = 10,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeekAtLastError_v3020                                     = 11,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorString_v3020                                      = 12,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020                                              = 13,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetCacheConfig_v3020                                  = 14,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetAttributes_v3020                                   = 15,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020                                           = 16,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020                                           = 17,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetValidDevices_v3020                                     = 18,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDeviceFlags_v3020                                      = 19,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc_v3020                                              = 20,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocPitch_v3020                                         = 21,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFree_v3020                                                = 22,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocArray_v3020                                         = 23,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeArray_v3020                                           = 24,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocHost_v3020                                          = 25,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeHost_v3020                                            = 26,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostAlloc_v3020                                           = 27,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostGetDevicePointer_v3020                                = 28,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostGetFlags_v3020                                        = 29,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemGetInfo_v3020                                          = 30,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020                                              = 31,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_v3020                                            = 32,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_v3020                                       = 33,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_v3020                                     = 34,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_v3020                                     = 35,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_v3020                                   = 36,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_v3020                                  = 37,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_v3020                                = 38,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_v3020                                      = 39,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_v3020                                    = 40,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020                                         = 41,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_v3020                                  = 42,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_v3020                                = 43,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_v3020                                       = 44,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_v3020                                = 45,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_v3020                              = 46,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_v3020                                 = 47,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_v3020                               = 48,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020                                              = 49,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_v3020                                            = 50,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020                                         = 51,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_v3020                                       = 52,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolAddress_v3020                                    = 53,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolSize_v3020                                       = 54,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture_v3020                                         = 55,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture2D_v3020                                       = 56,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToArray_v3020                                  = 57,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUnbindTexture_v3020                                       = 58,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureAlignmentOffset_v3020                           = 59,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureReference_v3020                                 = 60,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindSurfaceToArray_v3020                                  = 61,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceReference_v3020                                 = 62,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLSetGLDevice_v3020                                       = 63,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLRegisterBufferObject_v3020                              = 64,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObject_v3020                                   = 65,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObject_v3020                                 = 66,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnregisterBufferObject_v3020                            = 67,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLSetBufferObjectMapFlags_v3020                           = 68,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObjectAsync_v3020                              = 69,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObjectAsync_v3020                            = 70,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWGLGetDevice_v3020                                        = 71,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterImage_v3020                             = 72,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterBuffer_v3020                            = 73,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnregisterResource_v3020                          = 74,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceSetMapFlags_v3020                         = 75,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsMapResources_v3020                                = 76,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnmapResources_v3020                              = 77,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedPointer_v3020                    = 78,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsSubResourceGetMappedArray_v3020                   = 79,
+    CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUGetDevice_v3020                                      = 80,
+    CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUSetVDPAUDevice_v3020                                 = 81,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterVideoSurface_v3020                   = 82,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterOutputSurface_v3020                  = 83,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevice_v3020                                      = 84,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevices_v3020                                     = 85,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11SetDirect3DDevice_v3020                              = 86,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D11RegisterResource_v3020                       = 87,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevice_v3020                                      = 88,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevices_v3020                                     = 89,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10SetDirect3DDevice_v3020                              = 90,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D10RegisterResource_v3020                       = 91,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10RegisterResource_v3020                               = 92,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnregisterResource_v3020                             = 93,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10MapResources_v3020                                   = 94,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnmapResources_v3020                                 = 95,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceSetMapFlags_v3020                            = 96,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetSurfaceDimensions_v3020                   = 97,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedArray_v3020                         = 98,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPointer_v3020                       = 99,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedSize_v3020                          = 100,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPitch_v3020                         = 101,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevice_v3020                                       = 102,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevices_v3020                                      = 103,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9SetDirect3DDevice_v3020                               = 104,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDirect3DDevice_v3020                               = 105,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D9RegisterResource_v3020                        = 106,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterResource_v3020                                = 107,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterResource_v3020                              = 108,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapResources_v3020                                    = 109,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapResources_v3020                                  = 110,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceSetMapFlags_v3020                             = 111,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetSurfaceDimensions_v3020                    = 112,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedArray_v3020                          = 113,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPointer_v3020                        = 114,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedSize_v3020                           = 115,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPitch_v3020                          = 116,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9Begin_v3020                                           = 117,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9End_v3020                                             = 118,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterVertexBuffer_v3020                            = 119,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterVertexBuffer_v3020                          = 120,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapVertexBuffer_v3020                                 = 121,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapVertexBuffer_v3020                               = 122,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadExit_v3020                                          = 123,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForDevice_v3020                                  = 124,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForHost_v3020                                    = 125,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSynchronize_v3020                                   = 126,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetLimit_v3020                                      = 127,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetLimit_v3020                                      = 128,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreate_v3020                                        = 129,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v3020                                       = 130,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020                                   = 131,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_v3020                                         = 132,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020                                         = 133,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020                                = 134,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020                                         = 135,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020                                        = 136,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020                                    = 137,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventQuery_v3020                                          = 138,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventElapsedTime_v3020                                    = 139,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3D_v3020                                            = 140,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3DArray_v3020                                       = 141,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_v3020                                            = 142,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_v3020                                       = 143,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_v3020                                            = 144,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_v3020                                       = 145,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetCacheConfig_v3020                                = 146,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020                                     = 147,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDirect3DDevice_v3020                              = 148,
+    CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDirect3DDevice_v3020                              = 149,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetCacheConfig_v3020                                = 150,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPointerGetAttributes_v4000                                = 151,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostRegister_v4000                                        = 152,
+    CUPTI_RUNTIME_TRACE_CBID_cudaHostUnregister_v4000                                      = 153,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceCanAccessPeer_v4000                                 = 154,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceEnablePeerAccess_v4000                              = 155,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceDisablePeerAccess_v4000                             = 156,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerRegister_v4000                                        = 157,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerUnregister_v4000                                      = 158,
+    CUPTI_RUNTIME_TRACE_CBID_cudaPeerGetDevicePointer_v4000                                = 159,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeer_v4000                                          = 160,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeerAsync_v4000                                     = 161,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_v4000                                        = 162,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_v4000                                   = 163,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020                                         = 164,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020                                   = 165,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetLimit_v3020                                      = 166,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetLimit_v3020                                      = 167,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetCacheConfig_v3020                                = 168,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetCacheConfig_v3020                                = 169,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerInitialize_v4000                                  = 170,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStart_v4000                                       = 171,
+    CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStop_v4000                                        = 172,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetByPCIBusId_v4010                                 = 173,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetPCIBusId_v4010                                   = 174,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGLGetDevices_v4010                                        = 175,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetEventHandle_v4010                                   = 176,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenEventHandle_v4010                                  = 177,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetMemHandle_v4010                                     = 178,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenMemHandle_v4010                                    = 179,
+    CUPTI_RUNTIME_TRACE_CBID_cudaIpcCloseMemHandle_v4010                                   = 180,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetInfo_v4010                                        = 181,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetSharedMemConfig_v4020                              = 182,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetSharedMemConfig_v4020                            = 183,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetSharedMemConfig_v4020                            = 184,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v5000                                 = 185,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyTextureObject_v5000                                = 186,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceDesc_v5000                        = 187,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v5000                         = 188,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateSurfaceObject_v5000                                 = 189,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroySurfaceObject_v5000                                = 190,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceObjectResourceDesc_v5000                        = 191,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocMipmappedArray_v5000                                = 192,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetMipmappedArrayLevel_v5000                              = 193,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeMipmappedArray_v5000                                  = 194,
+    CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToMipmappedArray_v5000                         = 195,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedMipmappedArray_v5000             = 196,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_v5000                                   = 197,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithFlags_v5000                               = 198,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceViewDesc_v5000                    = 199,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetAttribute_v5000                                  = 200,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v5050                                       = 201,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithPriority_v5050                            = 202,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_v5050                                   = 203,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_v5050                                      = 204,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetStreamPriorityRange_v5050                        = 205,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocManaged_v6000                                       = 206,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000           = 207,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_v6000                                = 208,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorName_v6050                                        = 209,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050           = 210,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000                                        = 211,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceFlags_v7000                                      = 212,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000                                         = 213,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000                                   = 214,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_ptds_v7000                                         = 215,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_ptds_v7000                                       = 216,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_ptds_v7000                                  = 217,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_ptds_v7000                                = 218,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_ptds_v7000                                = 219,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_ptds_v7000                              = 220,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_ptds_v7000                             = 221,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_ptds_v7000                           = 222,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_ptds_v7000                                 = 223,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_ptds_v7000                               = 224,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_ptsz_v7000                                    = 225,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_ptsz_v7000                             = 226,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_ptsz_v7000                           = 227,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_ptsz_v7000                                  = 228,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_ptsz_v7000                           = 229,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_ptsz_v7000                         = 230,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_ptsz_v7000                            = 231,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_ptsz_v7000                          = 232,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset_ptds_v7000                                         = 233,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_ptds_v7000                                       = 234,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_ptsz_v7000                                    = 235,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_ptsz_v7000                                  = 236,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_ptsz_v7000                              = 237,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_ptsz_v7000                                 = 238,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_ptsz_v7000                              = 239,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_ptsz_v7000                                    = 240,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_ptsz_v7000                           = 241,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_ptsz_v7000                                    = 242,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_ptds_v7000                                       = 243,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_ptsz_v7000                                  = 244,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_ptds_v7000                                       = 245,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_ptsz_v7000                                  = 246,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_ptsz_v7000                                = 247,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_ptsz_v7000                              = 248,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_ptds_v7000                                   = 249,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_ptsz_v7000                              = 250,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000  = 251,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v8000                                    = 252,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_ptsz_v8000                               = 253,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v8000                                           = 254,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetP2PAttribute_v8000                               = 255,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsEGLRegisterImage_v7000                            = 256,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnect_v7000                            = 257,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerDisconnect_v7000                         = 258,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerAcquireFrame_v7000                       = 259,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerReleaseFrame_v7000                       = 260,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerConnect_v7000                            = 261,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerDisconnect_v7000                         = 262,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerPresentFrame_v7000                       = 263,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerReturnFrame_v7000                        = 264,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedEglFrame_v7000                   = 265,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttribute_v8000                                = 266,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttributes_v8000                               = 267,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnectWithFlags_v7000                   = 268,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000                             = 269,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000                        = 270,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateFromEGLSync_v9000                              = 271,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000                  = 272,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetAttribute_v9000                                    = 273,
+    CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalMemory_v10000                               = 274,
+    CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedBuffer_v10000                      = 275,
+    CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedMipmappedArray_v10000              = 276,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalMemory_v10000                              = 277,
+    CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalSemaphore_v10000                            = 278,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v10000                      = 279,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_ptsz_v10000                 = 280,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v10000                        = 281,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_ptsz_v10000                   = 282,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalSemaphore_v10000                           = 283,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_v10000                                     = 284,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_ptsz_v10000                                = 285,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphCreate_v10000                                        = 286,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetParams_v10000                           = 287,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetParams_v10000                           = 288,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddKernelNode_v10000                                 = 289,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode_v10000                                 = 290,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeGetParams_v10000                           = 291,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams_v10000                           = 292,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemsetNode_v10000                                 = 293,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeGetParams_v10000                           = 294,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeSetParams_v10000                           = 295,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddHostNode_v10000                                   = 296,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeGetParams_v10000                             = 297,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddChildGraphNode_v10000                             = 298,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphChildGraphNodeGetGraph_v10000                        = 299,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEmptyNode_v10000                                  = 300,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphClone_v10000                                         = 301,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeFindInClone_v10000                               = 302,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetType_v10000                                   = 303,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetRootNodes_v10000                                  = 304,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v10000                           = 305,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v10000                         = 306,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v10000                               = 307,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v10000                            = 308,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroyNode_v10000                                   = 309,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v10000                                   = 310,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000                                        = 311,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000                                   = 312,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecDestroy_v10000                                   = 313,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroy_v10000                                       = 314,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_v10000                                 = 315,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_ptsz_v10000                            = 316,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_v10000                                  = 317,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_ptsz_v10000                             = 318,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_v10000                                   = 319,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_ptsz_v10000                              = 320,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeSetParams_v10000                             = 321,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetNodes_v10000                                      = 322,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v10000                                      = 323,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v10010                               = 324,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_ptsz_v10010                          = 325,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecKernelNodeSetParams_v10010                       = 326,
+    CUPTI_RUNTIME_TRACE_CBID_cudaThreadExchangeStreamCaptureMode_v10010                    = 327,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetNvSciSyncAttributes_v10020                       = 328,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyAvailableDynamicSMemPerBlock_v10200              = 329,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_v10200                                     = 330,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_ptsz_v10200                                = 331,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams_v10020                       = 332,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemsetNodeSetParams_v10020                       = 333,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecHostNodeSetParams_v10020                         = 334,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecUpdate_v10020                                    = 335,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetFuncBySymbol_v11000                                    = 336,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCtxResetPersistingL2Cache_v11000                          = 337,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeCopyAttributes_v11000                      = 338,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetAttribute_v11000                        = 339,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetAttribute_v11000                        = 340,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_v11000                               = 341,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_ptsz_v11000                          = 342,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_v11000                                 = 343,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_ptsz_v11000                            = 344,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_v11000                                 = 345,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000                            = 346,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetTexture1DLinearMaxWidth_v11010                   = 347,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_v10000                                        = 348,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_ptsz_v10000                                   = 349,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeToSymbol_v11010                         = 350,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeFromSymbol_v11010                       = 351,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode1D_v11010                               = 352,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsToSymbol_v11010                   = 353,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsFromSymbol_v11010                 = 354,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams1D_v11010                         = 355,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010               = 356,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010             = 357,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams1D_v11010                     = 358,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetSparseProperties_v11010                           = 359,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetSparseProperties_v11010                  = 360,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecChildGraphNodeSetParams_v11010                   = 361,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventRecordNode_v11010                            = 362,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeGetEvent_v11010                       = 363,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeSetEvent_v11010                       = 364,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventWaitNode_v11010                              = 365,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeGetEvent_v11010                         = 366,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeSetEvent_v11010                         = 367,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventRecordNodeSetEvent_v11010                   = 368,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventWaitNodeSetEvent_v11010                     = 369,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_v11010                               = 370,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_ptsz_v11010                          = 371,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetDefaultMemPool_v11020                            = 372,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_v11020                                        = 373,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_ptsz_v11020                                   = 374,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_v11020                                          = 375,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_ptsz_v11020                                     = 376,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolTrimTo_v11020                                      = 377,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAttribute_v11020                                = 378,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAttribute_v11020                                = 379,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAccess_v11020                                   = 380,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetPlane_v11020                                      = 381,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAccess_v11020                                   = 382,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolCreate_v11020                                      = 383,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolDestroy_v11020                                     = 384,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetMemPool_v11020                                   = 385,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetMemPool_v11020                                   = 386,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportToShareableHandle_v11020                     = 387,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportFromShareableHandle_v11020                   = 388,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportPointer_v11020                               = 389,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportPointer_v11020                               = 390,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_v11020                                = 391,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_ptsz_v11020                           = 392,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_v11020                   = 393,
+    CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020              = 394,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_v11020                     = 395,
+    CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020                = 396,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresSignalNode_v11020               = 397,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeGetParams_v11020         = 398,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeSetParams_v11020         = 399,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresWaitNode_v11020                 = 400,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeGetParams_v11020           = 401,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeSetParams_v11020           = 402,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020     = 403,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020       = 404,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceFlushGPUDirectRDMAWrites_v11030                     = 405,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_v11030                                = 406,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_ptsz_v11030                           = 407,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphDebugDotPrint_v11030                                 = 408,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_v11030                            = 409,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_ptsz_v11030                       = 410,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v11030                    = 411,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_ptsz_v11030               = 412,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectCreate_v11030                                   = 413,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRetain_v11030                                   = 414,
+    CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRelease_v11030                                  = 415,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRetainUserObject_v11030                              = 416,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphReleaseUserObject_v11030                             = 417,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithFlags_v11040                          = 418,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemAllocNode_v11040                               = 419,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemAllocNodeGetParams_v11040                         = 420,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemFreeNode_v11040                                = 421,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemFreeNodeGetParams_v11040                          = 422,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGraphMemTrim_v11040                                 = 423,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetGraphMemAttribute_v11040                         = 424,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetGraphMemAttribute_v11040                         = 425,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetEnabled_v11060                                = 426,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetEnabled_v11060                                = 427,
+    CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetMemoryRequirements_v11060                         = 428,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetMemoryRequirements_v11060                = 429,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060                                    = 430,
+    CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060                               = 431,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxPotentialClusterSize_v11070                   = 432,
+    CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveClusters_v11070                         = 433,
+    CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v2_v11080                             = 434,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v2_v11080                     = 435,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithParams_v12000                         = 436,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithParams_ptsz_v12000                    = 437,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecGetFlags_v12000                                  = 438,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetKernel_v12000                                          = 439,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v2_v12000                             = 440,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetId_v12000                                        = 441,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetId_ptsz_v12000                                   = 442,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v12000                                   = 443,
+    CUPTI_RUNTIME_TRACE_CBID_cudaInitDevice_v12000                                         = 444,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddNode_v12020                                       = 445,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetParams_v12020                                 = 446,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecNodeSetParams_v12020                             = 447,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v2_v12020                                       = 448,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v2_v12020                                = 449,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v2_ptsz_v12020                           = 450,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetName_v12030                                        = 451,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCaptureToGraph_v12030                          = 452,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCaptureToGraph_ptsz_v12030                     = 453,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphConditionalHandleCreate_v12030                       = 454,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v2_v12030                                   = 455,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v2_v12030                        = 456,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v2_v12030                      = 457,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v2_v12030                            = 458,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v2_v12030                         = 459,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddNode_v2_v12030                                    = 460,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v3_v12030                            = 461,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v3_ptsz_v12030                       = 462,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v2_v12030                 = 463,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030            = 464,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceRegisterAsyncNotification_v12040                    = 465,
+    CUPTI_RUNTIME_TRACE_CBID_cudaDeviceUnregisterAsyncNotification_v12040                  = 466,
+    CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetParamInfo_v12040                                   = 467,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPointByVersion_v12050                       = 468,
+    CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPointByVersion_ptsz_v12050                  = 469,
+    CUPTI_RUNTIME_TRACE_CBID_cuda470_v12060                                                = 470,
+    CUPTI_RUNTIME_TRACE_CBID_cuda471_v12060                                                = 471,
+    CUPTI_RUNTIME_TRACE_CBID_cuda472_v12060                                                = 472,
+    CUPTI_RUNTIME_TRACE_CBID_cuda473_v12060                                                = 473,
+    CUPTI_RUNTIME_TRACE_CBID_cuda474_v12060                                                = 474,
+    CUPTI_RUNTIME_TRACE_CBID_cuda475_v12060                                                = 475,
+    CUPTI_RUNTIME_TRACE_CBID_cuda476_v12060                                                = 476,
+    CUPTI_RUNTIME_TRACE_CBID_cuda477_v12060                                                = 477,
+    CUPTI_RUNTIME_TRACE_CBID_cuda478_v12060                                                = 478,
+    CUPTI_RUNTIME_TRACE_CBID_cuda479_v12060                                                = 479,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetDevice_v12080                                    = 480,
+    CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetDevice_ptsz_v12080                               = 481,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyBatchAsync_v12080                                   = 482,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyBatchAsync_ptsz_v12080                              = 483,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DBatchAsync_v12080                                 = 484,
+    CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DBatchAsync_ptsz_v12080                            = 485,
+    CUPTI_RUNTIME_TRACE_CBID_cudaEventElapsedTime_v2_v12080                                = 486,
+    CUPTI_RUNTIME_TRACE_CBID_SIZE                                                          = 487,
+    CUPTI_RUNTIME_TRACE_CBID_FORCE_INT                                                     = 0x7fffffff
+} CUpti_runtime_api_trace_cbid;
+#endif

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_functions.h ADDED Viewed

	@@ -0,0 +1,65 @@

+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* guard macro not set by the includer: emit the "internal header" diagnostic below */
+#if defined(_MSC_VER)
+#pragma message("device_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else /* !_MSC_VER */
+#warning "device_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif /* _MSC_VER */
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ /* set only for the duration of the include below */
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__ /* remembers that this wrapper set the guard and must undo it */
+#endif /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
+#include "crt/device_functions.h" /* the real header; this file only forwards to it */
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__) /* restore the macro state found on entry */
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
+#endif /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__ */

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/library_types.h ADDED Viewed

	@@ -0,0 +1,111 @@

+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__LIBRARY_TYPES_H__)
+#define __LIBRARY_TYPES_H__
+#ifndef __CUDACC_RTC_MINIMAL__
+typedef enum cudaDataType_t /* data-type tags: CUDA_R_* = real, CUDA_C_* = complex; values are assigned explicitly and are not in declaration order */
+{
+    CUDA_R_16F  =  2, /* real as a half */
+    CUDA_C_16F  =  6, /* complex as a pair of half numbers */
+    CUDA_R_16BF = 14, /* real as a nv_bfloat16 */
+    CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */
+    CUDA_R_32F  =  0, /* real as a float */
+    CUDA_C_32F  =  4, /* complex as a pair of float numbers */
+    CUDA_R_64F  =  1, /* real as a double */
+    CUDA_C_64F  =  5, /* complex as a pair of double numbers */
+    CUDA_R_4I   = 16, /* real as a signed 4-bit int */
+    CUDA_C_4I   = 17, /* complex as a pair of signed 4-bit int numbers */
+    CUDA_R_4U   = 18, /* real as an unsigned 4-bit int */
+    CUDA_C_4U   = 19, /* complex as a pair of unsigned 4-bit int numbers */
+    CUDA_R_8I   =  3, /* real as a signed 8-bit int */
+    CUDA_C_8I   =  7, /* complex as a pair of signed 8-bit int numbers */
+    CUDA_R_8U   =  8, /* real as an unsigned 8-bit int */
+    CUDA_C_8U   =  9, /* complex as a pair of unsigned 8-bit int numbers */
+    CUDA_R_16I  = 20, /* real as a signed 16-bit int */
+    CUDA_C_16I  = 21, /* complex as a pair of signed 16-bit int numbers */
+    CUDA_R_16U  = 22, /* real as an unsigned 16-bit int */
+    CUDA_C_16U  = 23, /* complex as a pair of unsigned 16-bit int numbers */
+    CUDA_R_32I  = 10, /* real as a signed 32-bit int */
+    CUDA_C_32I  = 11, /* complex as a pair of signed 32-bit int numbers */
+    CUDA_R_32U  = 12, /* real as an unsigned 32-bit int */
+    CUDA_C_32U  = 13, /* complex as a pair of unsigned 32-bit int numbers */
+    CUDA_R_64I  = 24, /* real as a signed 64-bit int */
+    CUDA_C_64I  = 25, /* complex as a pair of signed 64-bit int numbers */
+    CUDA_R_64U  = 26, /* real as an unsigned 64-bit int */
+    CUDA_C_64U  = 27, /* complex as a pair of unsigned 64-bit int numbers */
+    CUDA_R_8F_E4M3 = 28, /* real as a nv_fp8_e4m3 */
+    CUDA_R_8F_UE4M3 = CUDA_R_8F_E4M3, /* real as an unsigned nv_fp8_e4m3; alias with the same numeric value as CUDA_R_8F_E4M3 */
+    CUDA_R_8F_E5M2 = 29, /* real as a nv_fp8_e5m2 */
+    CUDA_R_8F_UE8M0 = 30,  /* real as an exponent-only unsigned nv_fp8_e8m0 */
+    CUDA_R_6F_E2M3  = 31,  /* real as a nv_fp6_e2m3 */
+    CUDA_R_6F_E3M2  = 32,  /* real as a nv_fp6_e3m2 */
+    CUDA_R_4F_E2M1  = 33,  /* real as a nv_fp4_e2m1 */
+} cudaDataType;
+typedef enum libraryPropertyType_t /* names one component of a library version; NOTE(review): the consuming query functions are not visible here */
+{
+    MAJOR_VERSION, /* implicit value 0 */
+    MINOR_VERSION, /* implicit value 1 */
+    PATCH_LEVEL    /* implicit value 2 */
+} libraryPropertyType;
+#ifndef __cplusplus /* C only: an enum tag is not a type name in C, so make the tag names usable without the 'enum' keyword */
+typedef enum cudaDataType_t cudaDataType_t;
+typedef enum libraryPropertyType_t libraryPropertyType_t;
+#endif /* !__cplusplus */
+#endif  /* !__CUDACC_RTC_MINIMAL__ */
+#endif /* !__LIBRARY_TYPES_H__ */

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_cuda_host.h ADDED Viewed

	@@ -0,0 +1,179 @@

+#ifndef NVPERF_CUDA_HOST_H
+#define NVPERF_CUDA_HOST_H
+/*
+ * Copyright 2014-2024 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO USER:
+ *
+ * This source code is subject to NVIDIA ownership rights under U.S. and
+ * international Copyright laws.
+ *
+ * This software and the information contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
+ * of a form of NVIDIA software license agreement.
+ *
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
+ * CODE FOR ANY PURPOSE.  IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
+ * IMPLIED WARRANTY OF ANY KIND.  NVIDIA DISCLAIMS ALL WARRANTIES WITH
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
+ * OF USE, DATA OR PROFITS,  WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
+ * OR OTHER TORTIOUS ACTION,  ARISING OUT OF OR IN CONNECTION WITH THE USE
+ * OR PERFORMANCE OF THIS SOURCE CODE.
+ *
+ * U.S. Government End Users.   This source code is a "commercial item" as
+ * that term is defined at  48 C.F.R. 2.101 (OCT 1995), consisting  of
+ * "commercial computer  software"  and "commercial computer software
+ * documentation" as such terms are  used in 48 C.F.R. 12.212 (SEPT 1995)
+ * and is provided to the U.S. Government only as a commercial end item.
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
+ * source code with only those rights set forth herein.
+ *
+ * Any use of this source code in individual and commercial software must
+ * include, in the user documentation and internal comments to the code,
+ * the above Disclaimer and U.S. Government End Users Notice.
+ */
+#include <stddef.h>
+#include <stdint.h>
+#include "nvperf_common.h"
+#include "nvperf_host.h"
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB) /* GCC-compatible shared-library build */
+    #pragma GCC visibility push(default) /* declarations below get default visibility; matched by the 'visibility pop' at the end of this header */
+    #if !defined(NVPW_LOCAL) /* keep any definition supplied earlier */
+        #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
+    #endif /* !NVPW_LOCAL */
+#else /* other compilers or non-shared builds: NVPW_LOCAL expands to nothing */
+    #if !defined(NVPW_LOCAL) /* keep any definition supplied earlier */
+        #define NVPW_LOCAL
+    #endif /* !NVPW_LOCAL */
+#endif /* __GNUC__ && NVPA_SHARED_LIB */
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ *  @file   nvperf_cuda_host.h
+ */
+    typedef struct NVPW_CUDA_RawMetricsConfig_Create_Params // parameter block for NVPW_CUDA_RawMetricsConfig_Create()
+    {
+        /// [in] size of this struct; presumably NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE (TODO confirm against nvperf_common.h)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] activity kind, as an NVPA_ActivityKind value (type declared outside this header)
+        NVPA_ActivityKind activityKind;
+        /// [in] chip name (const char*; presumably NUL-terminated, TODO confirm)
+        const char* pChipName;
+        /// [out] new NVPA_RawMetricsConfig object
+        struct NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_CUDA_RawMetricsConfig_Create_Params;
+#define NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_RawMetricsConfig_Create_Params, pRawMetricsConfig)
+    NVPA_Status NVPW_CUDA_RawMetricsConfig_Create(NVPW_CUDA_RawMetricsConfig_Create_Params* pParams); // takes the parameter block by pointer and reports its result as an NVPA_Status
+    typedef struct NVPW_CUDA_RawMetricsConfig_Create_V2_Params // parameter block for NVPW_CUDA_RawMetricsConfig_Create_V2(); adds pCounterAvailabilityImage compared with the non-V2 struct
+    {
+        /// [in] size of this struct; presumably NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE (TODO confirm against nvperf_common.h)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] activity kind, as an NVPA_ActivityKind value (type declared outside this header)
+        NVPA_ActivityKind activityKind;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [out] new NVPA_RawMetricsConfig object
+        struct NVPA_RawMetricsConfig* pRawMetricsConfig;
+    } NVPW_CUDA_RawMetricsConfig_Create_V2_Params;
+#define NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_RawMetricsConfig_Create_V2_Params, pRawMetricsConfig)
+    /// Use either 'pChipName' or 'pCounterAvailabilityImage'.
+    NVPA_Status NVPW_CUDA_RawMetricsConfig_Create_V2(NVPW_CUDA_RawMetricsConfig_Create_V2_Params* pParams); // takes the parameter block by pointer and reports its result as an NVPA_Status
+    typedef struct NVPW_CUDA_CounterDataBuilder_Create_Params // parameter block for NVPW_CUDA_CounterDataBuilder_Create()
+    {
+        /// [in] size of this struct; presumably NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE (TODO confirm against nvperf_common.h)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [out] new NVPA_CounterDataBuilder object
+        struct NVPA_CounterDataBuilder* pCounterDataBuilder;
+    } NVPW_CUDA_CounterDataBuilder_Create_Params;
+#define NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_CounterDataBuilder_Create_Params, pCounterDataBuilder)
+    /// Use either 'pChipName' or 'pCounterAvailabilityImage'.
+    NVPA_Status NVPW_CUDA_CounterDataBuilder_Create(NVPW_CUDA_CounterDataBuilder_Create_Params* pParams); // takes the parameter block by pointer and reports its result as an NVPA_Status
+    typedef struct NVPW_MetricsEvaluator NVPW_MetricsEvaluator; // forward declaration only; the struct body is not defined in the visible part of this header
+    typedef struct NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params // parameter block for NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize()
+    {
+        /// [in] size of this struct; presumably NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE (TODO confirm against nvperf_common.h)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [out] scratch buffer size; the buffer handed to 'NVPW_CUDA_MetricsEvaluator_Initialize' should be at least this large
+        size_t scratchBufferSize;
+    } NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params;
+#define NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params, scratchBufferSize)
+    /// Use either 'pChipName' or 'pCounterAvailabilityImage'.
+    NVPA_Status NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params* pParams); // takes the parameter block by pointer and reports its result as an NVPA_Status
+    typedef struct NVPW_CUDA_MetricsEvaluator_Initialize_Params // parameter block for NVPW_CUDA_MetricsEvaluator_Initialize()
+    {
+        /// [in] size of this struct; presumably NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE (TODO confirm against nvperf_common.h)
+        size_t structSize;
+        /// [in] assign to NULL
+        void* pPriv;
+        /// [in] caller-provided scratch buffer; its size is passed in 'scratchBufferSize'
+        uint8_t* pScratchBuffer;
+        /// [in] the size of the 'pScratchBuffer' array, should be at least the size of the 'scratchBufferSize' returned
+        /// by 'NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize'
+        size_t scratchBufferSize;
+        /// [in] accepted for chips supported at the time-of-release.
+        const char* pChipName;
+        /// [in] buffer with counter availability image - required for future chip support
+        const uint8_t* pCounterAvailabilityImage;
+        /// [in] counter data image; its size goes in 'counterDataImageSize'
+        const uint8_t* pCounterDataImage;
+        /// [in] must be provided if 'pCounterDataImage' is not NULL
+        size_t counterDataImageSize;
+        /// [out] the resulting NVPW_MetricsEvaluator object
+        struct NVPW_MetricsEvaluator* pMetricsEvaluator;
+    } NVPW_CUDA_MetricsEvaluator_Initialize_Params;
+#define NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_MetricsEvaluator_Initialize_Params, pMetricsEvaluator)
+    /// Use one of 'pChipName', 'pCounterAvailabilityImage', or 'pCounterDataImage'. 'pChipName' or
+    /// 'pCounterAvailabilityImage' will create a metrics evaluator based on a virtual device while 'pCounterDataImage'
+    /// will create a metrics evaluator based on the actual device.
+    NVPA_Status NVPW_CUDA_MetricsEvaluator_Initialize(NVPW_CUDA_MetricsEvaluator_Initialize_Params* pParams); // takes the parameter block by pointer and reports its result as an NVPA_Status
+#ifdef __cplusplus
+} // extern "C"
+#endif
+#if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
+    #pragma GCC visibility pop
+#endif
+#endif // NVPERF_CUDA_HOST_H

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.hpp ADDED Viewed

	@@ -0,0 +1,588 @@

+/*
+ * Copyright 1993-2020 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__SM_32_INTRINSICS_HPP__)
+#define __SM_32_INTRINSICS_HPP__
+#if defined(__CUDACC_RTC__) /* runtime-compilation build: intrinsics are declared plain __device__ */
+#define __SM_32_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__: intrinsics are static inline device functions */
+#define __SM_32_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+#if defined(__cplusplus) && defined(__CUDACC__)
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#include "cuda_runtime_api.h"
+// In here are intrinsics which are built in to the compiler. These may be
+// referenced by intrinsic implementations from this file.
+extern "C"
+{
+    // There are no intrinsics built in to the compiler for SM-3.5,
+    // all intrinsics are now implemented as inline PTX below.
+}
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-3.5 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+// LDG is a "load from global via texture path" command which can exhibit higher
+// bandwidth on GK110 than a regular LD.
+// Inline-asm constraint used for the address operand of the loads below: "l" when pointers are 64 bits wide (Win64 under MSVC, LP64, or __CUDACC_RTC__), otherwise "r" for 32-bit pointers.
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l"
+#else /* 32-bit pointers */
+#define __LDG_PTR   "r"
+#endif /* pointer-width selection for __LDG_PTR */
+/******************************************************************************
+ *                                   __ldg                                    *
+ *  Every overload issues a single ld.global.nc load whose PTX type suffix    *
+ *  matches the C++ type and returns the loaded value.  8-bit results are     *
+ *  received in "r" operands and narrowed by a cast; 16-bit loads use "h",    *
+ *  32-bit "r", 64-bit "l", float "f" and double "d".  Vector types use the   *
+ *  .v2/.v4 forms.  The address operand constraint comes from __LDG_PTR.      *
+ ******************************************************************************/
+// Size of long is architecture and OS specific, so the long overloads choose a 64-bit or a 32-bit load based on __LP64__.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif
+__SM_32_INTRINSICS_DECL__ char __ldg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldg(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.nc.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.nc.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.nc.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.nc.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.nc.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.nc.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned char __ldg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.nc.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.nc.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.nc.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.nc.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.nc.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.nc.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float __ldg(const float *ptr) { float ret; asm volatile ("ld.global.nc.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldg(const double *ptr) { double ret; asm volatile ("ld.global.nc.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.nc.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.nc.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.nc.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+/******************************************************************************
+ *                                   __ldcg                                    *
+ ******************************************************************************/
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif
+// __ldcg overloads for signed integer scalars and vectors. Each one issues a single
+// `asm volatile` ld.global.cg load of the matching .s8/.s16/.s32/.s64 (or .v2/.v4) type;
+// the address operand is supplied through __LDG_PTR(ptr) and no clobbers are declared.
+// 8-bit elements are received in int-sized temporaries ("=r" operands) and narrowed by a
+// cast afterwards; 16-bit elements use "=h" operands and 64-bit elements use "=l".
+// NOTE(review): the PTX ISA describes .cg as the "cache at global level" load operator - confirm against the PTX docs.
+__SM_32_INTRINSICS_DECL__ char __ldcg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldcg(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldcg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.cg.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldcg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldcg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cg.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldcg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.cg.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.cg.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.cg.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.cg.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cg.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.cg.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// __ldcg overloads for unsigned integer scalars and vectors, using the .u8/.u16/.u32/.u64
+// forms of ld.global.cg. 8-bit elements are loaded through unsigned int / uint2 / uint4
+// temporaries and narrowed to unsigned char; wider elements are written directly into the result.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.cg.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.cg.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cg.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.cg.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.cg.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.cg.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.cg.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.cg.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.cg.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// __ldcg overloads for float/double scalars and vectors: .f32 loads use "=f" output
+// operands, .f64 loads use "=d"; the results are written straight into the return value.
+__SM_32_INTRINSICS_DECL__ float __ldcg(const float *ptr) { float ret; asm volatile ("ld.global.cg.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldcg(const double *ptr) { double ret; asm volatile ("ld.global.cg.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cg.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cg.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cg.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+/******************************************************************************
+ *                                   __ldca                                    *
+ ******************************************************************************/
+// Size of long is architecture and OS specific.
+// __ldca overloads for long / unsigned long. When __LP64__ is defined the load is emitted
+// as ld.global.ca.s64 / .u64 with an "=l" output operand; otherwise as ld.global.ca.s32 /
+// .u32 with an "=r" output operand. The signed overload loads into an unsigned long
+// temporary and casts the result back to long.
+// NOTE(review): the PTX ISA describes .ca as the "cache at all levels" load operator - confirm against the PTX docs.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif
+// __ldca overloads for signed integer scalars and vectors. Each one issues a single
+// `asm volatile` ld.global.ca load of the matching .s8/.s16/.s32/.s64 (or .v2/.v4) type,
+// with the address passed through __LDG_PTR(ptr) and no clobber list.
+// 8-bit elements are received in int-sized temporaries ("=r" operands) and narrowed by a
+// cast afterwards; 16-bit elements use "=h" operands and 64-bit elements use "=l".
+__SM_32_INTRINSICS_DECL__ char __ldca(const char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldca(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldca(const short *ptr) { unsigned short ret; asm volatile ("ld.global.ca.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldca(const int *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldca(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.ca.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldca(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.ca.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldca(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.ca.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldca(const short2 *ptr) { short2 ret; asm volatile ("ld.global.ca.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldca(const short4 *ptr) { short4 ret; asm volatile ("ld.global.ca.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldca(const int2 *ptr) { int2 ret; asm volatile ("ld.global.ca.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldca(const int4 *ptr) { int4 ret; asm volatile ("ld.global.ca.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldca(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.ca.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// __ldca overloads for unsigned integer scalars and vectors, using the .u8/.u16/.u32/.u64
+// forms of ld.global.ca. 8-bit elements are loaded through unsigned int / uint2 / uint4
+// temporaries and narrowed to unsigned char; wider elements are written directly into the result.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldca(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldca(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.ca.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldca(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.ca.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldca(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.ca.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldca(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.ca.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldca(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.ca.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldca(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.ca.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldca(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.ca.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldca(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.ca.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldca(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.ca.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldca(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.ca.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// __ldca overloads for float/double scalars and vectors: .f32 loads use "=f" output
+// operands, .f64 loads use "=d"; the results are written straight into the return value.
+__SM_32_INTRINSICS_DECL__ float __ldca(const float *ptr) { float ret; asm volatile ("ld.global.ca.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldca(const double *ptr) { double ret; asm volatile ("ld.global.ca.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldca(const float2 *ptr) { float2 ret; asm volatile ("ld.global.ca.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldca(const float4 *ptr) { float4 ret; asm volatile ("ld.global.ca.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldca(const double2 *ptr) { double2 ret; asm volatile ("ld.global.ca.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+/******************************************************************************
+ *                                   __ldcs                                    *
+ ******************************************************************************/
+// Size of long is architecture and OS specific.
+// __ldcs overloads for long / unsigned long. When __LP64__ is defined the load is emitted
+// as ld.global.cs.s64 / .u64 with an "=l" output operand; otherwise as ld.global.cs.s32 /
+// .u32 with an "=r" output operand. The signed overload loads into an unsigned long
+// temporary and casts the result back to long.
+// NOTE(review): the PTX ISA describes .cs as the "cache streaming" load operator - confirm against the PTX docs.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+#endif
+// __ldcs overloads for signed integer scalars and vectors. Each one issues a single
+// `asm volatile` ld.global.cs load of the matching .s8/.s16/.s32/.s64 (or .v2/.v4) type,
+// with the address passed through __LDG_PTR(ptr) and no clobber list.
+// 8-bit elements are received in int-sized temporaries ("=r" operands) and narrowed by a
+// cast afterwards; 16-bit elements use "=h" operands and 64-bit elements use "=l".
+__SM_32_INTRINSICS_DECL__ char __ldcs(const char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldcs(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldcs(const short *ptr) { unsigned short ret; asm volatile ("ld.global.cs.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldcs(const int *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldcs(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cs.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldcs(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.cs.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcs(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.cs.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcs(const short2 *ptr) { short2 ret; asm volatile ("ld.global.cs.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcs(const short4 *ptr) { short4 ret; asm volatile ("ld.global.cs.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcs(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cs.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcs(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcs(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.cs.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// __ldcs overloads for unsigned integer scalars and vectors, using the .u8/.u16/.u32/.u64
+// forms of ld.global.cs. 8-bit elements are loaded through unsigned int / uint2 / uint4
+// temporaries and narrowed to unsigned char; wider elements are written directly into the result.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcs(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr));  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcs(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.cs.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcs(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.cs.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcs(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cs.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcs(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.cs.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcs(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.cs.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcs(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.cs.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcs(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.cs.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcs(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.cs.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcs(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcs(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.cs.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
+// __ldcs overloads for float/double scalars and vectors: .f32 loads use "=f" output
+// operands, .f64 loads use "=d"; the results are written straight into the return value.
+__SM_32_INTRINSICS_DECL__ float __ldcs(const float *ptr) { float ret; asm volatile ("ld.global.cs.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldcs(const double *ptr) { double ret; asm volatile ("ld.global.cs.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcs(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cs.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcs(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cs.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcs(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cs.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
+/******************************************************************************
+ *                                   __ldlu                                    *
+ ******************************************************************************/
+// Size of long is architecture and OS specific.
+// __ldlu overloads for long / unsigned long. When __LP64__ is defined the load is emitted
+// as ld.global.lu.s64 / .u64 with an "=l" output operand; otherwise as ld.global.lu.s32 /
+// .u32 with an "=r" output operand. The signed overload loads into an unsigned long
+// temporary and casts the result back to long.
+// These use a plain (non-volatile) asm statement that declares a "memory" clobber.
+// NOTE(review): the PTX ISA describes .lu as the "last use" load operator - confirm against the PTX docs.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#endif
+// __ldlu overloads for signed integer scalars and vectors. Each one issues a single
+// ld.global.lu load of the matching .s8/.s16/.s32/.s64 (or .v2/.v4) type through a plain
+// asm statement with a "memory" clobber; the address is passed through __LDG_PTR(ptr).
+// 8-bit elements are received in int-sized temporaries ("=r" operands) and narrowed by a
+// cast afterwards; 16-bit elements use "=h" operands and 64-bit elements use "=l".
+__SM_32_INTRINSICS_DECL__ char __ldlu(const char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldlu(const signed char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldlu(const short *ptr) { unsigned short ret; asm ("ld.global.lu.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldlu(const int *ptr) { unsigned int ret; asm ("ld.global.lu.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldlu(const long long *ptr) { unsigned long long ret; asm ("ld.global.lu.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldlu(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.lu.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldlu(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.lu.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldlu(const short2 *ptr) { short2 ret; asm ("ld.global.lu.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldlu(const short4 *ptr) { short4 ret; asm ("ld.global.lu.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldlu(const int2 *ptr) { int2 ret; asm ("ld.global.lu.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldlu(const int4 *ptr) { int4 ret; asm ("ld.global.lu.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldlu(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.lu.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// __ldlu overloads for unsigned integer scalars and vectors, using the .u8/.u16/.u32/.u64
+// forms of ld.global.lu (plain asm with a "memory" clobber). 8-bit elements are loaded
+// through unsigned int / uint2 / uint4 temporaries and narrowed to unsigned char; wider
+// elements are written directly into the result.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldlu(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.lu.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory");  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldlu(const unsigned short *ptr) { unsigned short ret; asm ("ld.global.lu.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldlu(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.lu.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldlu(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.lu.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldlu(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.lu.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldlu(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.lu.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldlu(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.lu.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldlu(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.lu.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldlu(const uint2 *ptr) { uint2 ret; asm ("ld.global.lu.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldlu(const uint4 *ptr) { uint4 ret; asm ("ld.global.lu.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldlu(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.lu.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// __ldlu overloads for float/double scalars and vectors: .f32 loads use "=f" output
+// operands, .f64 loads use "=d"; each asm statement declares a "memory" clobber and the
+// results are written straight into the return value.
+__SM_32_INTRINSICS_DECL__ float __ldlu(const float *ptr) { float ret; asm ("ld.global.lu.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldlu(const double *ptr) { double ret; asm ("ld.global.lu.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldlu(const float2 *ptr) { float2 ret; asm ("ld.global.lu.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldlu(const float4 *ptr) { float4 ret; asm ("ld.global.lu.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldlu(const double2 *ptr) { double2 ret; asm ("ld.global.lu.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+/******************************************************************************
+ *                                   __ldcv                                    *
+ ******************************************************************************/
+// Size of long is architecture and OS specific.
+// __ldcv overloads for long / unsigned long. When __LP64__ is defined the load is emitted
+// as ld.global.cv.s64 / .u64 with an "=l" output operand; otherwise as ld.global.cv.s32 /
+// .u32 with an "=r" output operand. The signed overload loads into an unsigned long
+// temporary and casts the result back to long.
+// These use a plain (non-volatile) asm statement that declares a "memory" clobber.
+// NOTE(review): the PTX ISA describes .cv as the "don't cache, fetch again" (volatile) load operator - confirm against the PTX docs.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+#endif
+// __ldcv overloads for signed integer scalars and vectors. Each one issues a single
+// ld.global.cv load of the matching .s8/.s16/.s32/.s64 (or .v2/.v4) type through a plain
+// asm statement with a "memory" clobber; the address is passed through __LDG_PTR(ptr).
+// 8-bit elements are received in int-sized temporaries ("=r" operands) and narrowed by a
+// cast afterwards; 16-bit elements use "=h" operands and 64-bit elements use "=l".
+__SM_32_INTRINSICS_DECL__ char __ldcv(const char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; }
+__SM_32_INTRINSICS_DECL__ signed char __ldcv(const signed char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; }
+__SM_32_INTRINSICS_DECL__ short __ldcv(const short *ptr) { unsigned short ret; asm ("ld.global.cv.s16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; }
+__SM_32_INTRINSICS_DECL__ int __ldcv(const int *ptr) { unsigned int ret; asm ("ld.global.cv.s32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; }
+__SM_32_INTRINSICS_DECL__ long long __ldcv(const long long *ptr) { unsigned long long ret; asm ("ld.global.cv.s64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; }
+__SM_32_INTRINSICS_DECL__ char2 __ldcv(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.cv.v2.s8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ char4 __ldcv(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.cv.v4.s8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ short2 __ldcv(const short2 *ptr) { short2 ret; asm ("ld.global.cv.v2.s16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ short4 __ldcv(const short4 *ptr) { short4 ret; asm ("ld.global.cv.v4.s16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int2 __ldcv(const int2 *ptr) { int2 ret; asm ("ld.global.cv.v2.s32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ int4 __ldcv(const int4 *ptr) { int4 ret; asm ("ld.global.cv.v4.s32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ longlong2 __ldcv(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.cv.v2.s64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// __ldcv overloads for unsigned integer scalars and vectors, using the .u8/.u16/.u32/.u64
+// forms of ld.global.cv (plain asm with a "memory" clobber). 8-bit elements are loaded
+// through unsigned int / uint2 / uint4 temporaries and narrowed to unsigned char; wider
+// elements are written directly into the result.
+__SM_32_INTRINSICS_DECL__ unsigned char __ldcv(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.cv.u8 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory");  return (unsigned char)ret; }
+__SM_32_INTRINSICS_DECL__ unsigned short __ldcv(const unsigned short *ptr) { unsigned short ret; asm ("ld.global.cv.u16 %0, [%1];"  : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned int __ldcv(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.cv.u32 %0, [%1];"  : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ unsigned long long __ldcv(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.cv.u64 %0, [%1];"  : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uchar2 __ldcv(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.cv.v2.u8 {%0,%1}, [%2];"  : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
+__SM_32_INTRINSICS_DECL__ uchar4 __ldcv(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.cv.v4.u8 {%0,%1,%2,%3}, [%4];"  : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
+__SM_32_INTRINSICS_DECL__ ushort2 __ldcv(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.cv.v2.u16 {%0,%1}, [%2];"  : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ushort4 __ldcv(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.cv.v4.u16 {%0,%1,%2,%3}, [%4];"  : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint2 __ldcv(const uint2 *ptr) { uint2 ret; asm ("ld.global.cv.v2.u32 {%0,%1}, [%2];"  : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ uint4 __ldcv(const uint4 *ptr) { uint4 ret; asm ("ld.global.cv.v4.u32 {%0,%1,%2,%3}, [%4];"  : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ ulonglong2 __ldcv(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.cv.v2.u64 {%0,%1}, [%2];"  : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+// __ldcv overloads for float/double scalars and vectors: .f32 loads use "=f" output
+// operands, .f64 loads use "=d"; each asm statement declares a "memory" clobber and the
+// results are written straight into the return value.
+__SM_32_INTRINSICS_DECL__ float __ldcv(const float *ptr) { float ret; asm ("ld.global.cv.f32 %0, [%1];"  : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double __ldcv(const double *ptr) { double ret; asm ("ld.global.cv.f64 %0, [%1];"  : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float2 __ldcv(const float2 *ptr) { float2 ret; asm ("ld.global.cv.v2.f32 {%0,%1}, [%2];"  : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ float4 __ldcv(const float4 *ptr) { float4 ret; asm ("ld.global.cv.v4.f32 {%0,%1,%2,%3}, [%4];"  : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
+__SM_32_INTRINSICS_DECL__ double2 __ldcv(const double2 *ptr) { double2 ret; asm ("ld.global.cv.v2.f64 {%0,%1}, [%2];"  : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
+/******************************************************************************
+ *                                   __stwb                                    *
+ ******************************************************************************/
+// Size of long is architecture and OS specific.
+// __stwb overloads for long / unsigned long. When __LP64__ is defined the store is emitted
+// as st.global.wb.s64 / .u64 with the value passed as an "l" input operand; otherwise as
+// st.global.wb.s32 / .u32 with an "r" input operand. The asm statements have no output
+// operands (address and value are both inputs) and declare a "memory" clobber.
+// NOTE(review): the PTX ISA describes .wb as the write-back store cache operator - confirm against the PTX docs.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+__SM_32_INTRINSICS_DECL__ void __stwb(char *ptr, char value) { asm ("st.global.wb.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(signed char *ptr, signed char value) { asm ("st.global.wb.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short *ptr, short value) { asm ("st.global.wb.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int *ptr, int value) { asm ("st.global.wb.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(long long *ptr, long long value) { asm ("st.global.wb.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short2 *ptr, short2 value) { asm ("st.global.wb.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(short4 *ptr, short4 value) { asm ("st.global.wb.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int2 *ptr, int2 value) { asm ("st.global.wb.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(int4 *ptr, int4 value) { asm ("st.global.wb.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(longlong2 *ptr, longlong2 value) { asm ("st.global.wb.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned char *ptr, unsigned char value) { asm ("st.global.wb.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned short *ptr, unsigned short value) { asm ("st.global.wb.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned int *ptr, unsigned int value) { asm ("st.global.wb.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wb.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort2 *ptr, ushort2 value) { asm ("st.global.wb.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ushort4 *ptr, ushort4 value) { asm ("st.global.wb.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uint2 *ptr, uint2 value) { asm ("st.global.wb.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(uint4 *ptr, uint4 value) { asm ("st.global.wb.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wb.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(float *ptr, float value) { asm ("st.global.wb.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(double *ptr, double value) { asm ("st.global.wb.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(float2 *ptr, float2 value) { asm ("st.global.wb.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(float4 *ptr, float4 value) { asm ("st.global.wb.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwb(double2 *ptr, double2 value) { asm ("st.global.wb.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+/******************************************************************************
+ *                                   __stcg                                    *
+ *                                                                            *
+ * Overload set that stores `value` to `*ptr`; every overload is a single     *
+ * inline-PTX `st.global.cg` instruction.                                     *
+ *                                                                            *
+ *   ptr    destination address. The instruction names the .global state      *
+ *          space, so the pointer is expected to refer to global memory.      *
+ *   value  scalar or built-in vector value to write.                         *
+ *   Returns nothing.                                                         *
+ *                                                                            *
+ * Notes:                                                                     *
+ *  - `.cg` is the PTX "cache at global level" store cache operator; see      *
+ *    the PTX ISA "Cache Operators" section for its exact behaviour.          *
+ *  - 8-bit elements are converted to `int` first because they are passed     *
+ *    through 32-bit "r" register operands.                                   *
+ *  - The `long` / `unsigned long` overloads emit the 64-bit form when        *
+ *    __LP64__ is defined and the 32-bit form otherwise.                      *
+ *  - The "memory" clobber tells the compiler that the asm accesses memory.   *
+ *  - __LDG_PTR is defined outside this excerpt; it supplies the asm          *
+ *    constraint used for the pointer operand.                                *
+ ******************************************************************************/
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+__SM_32_INTRINSICS_DECL__ void __stcg(char *ptr, char value) { asm ("st.global.cg.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(signed char *ptr, signed char value) { asm ("st.global.cg.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short *ptr, short value) { asm ("st.global.cg.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int *ptr, int value) { asm ("st.global.cg.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(long long *ptr, long long value) { asm ("st.global.cg.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short2 *ptr, short2 value) { asm ("st.global.cg.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(short4 *ptr, short4 value) { asm ("st.global.cg.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int2 *ptr, int2 value) { asm ("st.global.cg.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(int4 *ptr, int4 value) { asm ("st.global.cg.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(longlong2 *ptr, longlong2 value) { asm ("st.global.cg.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned char *ptr, unsigned char value) { asm ("st.global.cg.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned short *ptr, unsigned short value) { asm ("st.global.cg.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned int *ptr, unsigned int value) { asm ("st.global.cg.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cg.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort2 *ptr, ushort2 value) { asm ("st.global.cg.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ushort4 *ptr, ushort4 value) { asm ("st.global.cg.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uint2 *ptr, uint2 value) { asm ("st.global.cg.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(uint4 *ptr, uint4 value) { asm ("st.global.cg.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cg.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(float *ptr, float value) { asm ("st.global.cg.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(double *ptr, double value) { asm ("st.global.cg.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(float2 *ptr, float2 value) { asm ("st.global.cg.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(float4 *ptr, float4 value) { asm ("st.global.cg.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcg(double2 *ptr, double2 value) { asm ("st.global.cg.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+/******************************************************************************
+ *                                   __stcs                                    *
+ *                                                                            *
+ * Overload set that stores `value` to `*ptr`; every overload is a single     *
+ * inline-PTX `st.global.cs` instruction.                                     *
+ *                                                                            *
+ *   ptr    destination address. The instruction names the .global state      *
+ *          space, so the pointer is expected to refer to global memory.      *
+ *   value  scalar or built-in vector value to write.                         *
+ *   Returns nothing.                                                         *
+ *                                                                            *
+ * Notes:                                                                     *
+ *  - `.cs` is the PTX "cache streaming" store cache operator; see the        *
+ *    PTX ISA "Cache Operators" section for its exact behaviour.              *
+ *  - 8-bit elements are converted to `int` first because they are passed     *
+ *    through 32-bit "r" register operands.                                   *
+ *  - The `long` / `unsigned long` overloads emit the 64-bit form when        *
+ *    __LP64__ is defined and the 32-bit form otherwise.                      *
+ *  - The "memory" clobber tells the compiler that the asm accesses memory.   *
+ *  - __LDG_PTR is defined outside this excerpt; it supplies the asm          *
+ *    constraint used for the pointer operand.                                *
+ ******************************************************************************/
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+__SM_32_INTRINSICS_DECL__ void __stcs(char *ptr, char value) { asm ("st.global.cs.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(signed char *ptr, signed char value) { asm ("st.global.cs.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short *ptr, short value) { asm ("st.global.cs.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int *ptr, int value) { asm ("st.global.cs.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(long long *ptr, long long value) { asm ("st.global.cs.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short2 *ptr, short2 value) { asm ("st.global.cs.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(short4 *ptr, short4 value) { asm ("st.global.cs.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int2 *ptr, int2 value) { asm ("st.global.cs.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(int4 *ptr, int4 value) { asm ("st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(longlong2 *ptr, longlong2 value) { asm ("st.global.cs.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned char *ptr, unsigned char value) { asm ("st.global.cs.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned short *ptr, unsigned short value) { asm ("st.global.cs.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned int *ptr, unsigned int value) { asm ("st.global.cs.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cs.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort2 *ptr, ushort2 value) { asm ("st.global.cs.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ushort4 *ptr, ushort4 value) { asm ("st.global.cs.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uint2 *ptr, uint2 value) { asm ("st.global.cs.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(uint4 *ptr, uint4 value) { asm ("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cs.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(float *ptr, float value) { asm ("st.global.cs.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(double *ptr, double value) { asm ("st.global.cs.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(float2 *ptr, float2 value) { asm ("st.global.cs.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(float4 *ptr, float4 value) { asm ("st.global.cs.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stcs(double2 *ptr, double2 value) { asm ("st.global.cs.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+/******************************************************************************
+ *                                   __stwt                                    *
+ *                                                                            *
+ * Overload set that stores `value` to `*ptr`; every overload is a single     *
+ * inline-PTX `st.global.wt` instruction.                                     *
+ *                                                                            *
+ *   ptr    destination address. The instruction names the .global state      *
+ *          space, so the pointer is expected to refer to global memory.      *
+ *   value  scalar or built-in vector value to write.                         *
+ *   Returns nothing.                                                         *
+ *                                                                            *
+ * Notes:                                                                     *
+ *  - `.wt` is the PTX write-through store cache operator; see the PTX ISA    *
+ *    "Cache Operators" section for its exact behaviour.                      *
+ *  - 8-bit elements are converted to `int` first because they are passed     *
+ *    through 32-bit "r" register operands.                                   *
+ *  - The `long` / `unsigned long` overloads emit the 64-bit form when        *
+ *    __LP64__ is defined and the 32-bit form otherwise.                      *
+ *  - The "memory" clobber tells the compiler that the asm accesses memory.   *
+ *  - __LDG_PTR is defined outside this excerpt; it supplies the asm          *
+ *    constraint used for the pointer operand.                                *
+ ******************************************************************************/
+// Size of long is architecture and OS specific.
+#if defined(__LP64__) // 64 bits
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+#else // 32 bits
+__SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s32 [%0], %1;"  :: __LDG_PTR (ptr),  "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+#endif
+__SM_32_INTRINSICS_DECL__ void __stwt(char *ptr, char value) { asm ("st.global.wt.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(signed char *ptr, signed char value) { asm ("st.global.wt.s8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short *ptr, short value) { asm ("st.global.wt.s16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int *ptr, int value) { asm ("st.global.wt.s32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(long long *ptr, long long value) { asm ("st.global.wt.s64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short2 *ptr, short2 value) { asm ("st.global.wt.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(short4 *ptr, short4 value) { asm ("st.global.wt.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int2 *ptr, int2 value) { asm ("st.global.wt.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(int4 *ptr, int4 value) { asm ("st.global.wt.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(longlong2 *ptr, longlong2 value) { asm ("st.global.wt.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned char *ptr, unsigned char value) { asm ("st.global.wt.u8 [%0], %1;"  :: __LDG_PTR (ptr), "r"((int)value) : "memory");  }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned short *ptr, unsigned short value) { asm ("st.global.wt.u16 [%0], %1;"  :: __LDG_PTR (ptr), "h"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned int *ptr, unsigned int value) { asm ("st.global.wt.u32 [%0], %1;"  :: __LDG_PTR (ptr), "r"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wt.u64 [%0], %1;"  :: __LDG_PTR (ptr), "l"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.u8 [%0], {%1,%2};"  :: __LDG_PTR (ptr),  "r"(x), "r"(y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort2 *ptr, ushort2 value) { asm ("st.global.wt.v2.u16 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ushort4 *ptr, ushort4 value) { asm ("st.global.wt.v4.u16 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uint2 *ptr, uint2 value) { asm ("st.global.wt.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(uint4 *ptr, uint4 value) { asm ("st.global.wt.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wt.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(float *ptr, float value) { asm ("st.global.wt.f32 [%0], %1;"  :: __LDG_PTR (ptr), "f"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(double *ptr, double value) { asm ("st.global.wt.f64 [%0], %1;"  :: __LDG_PTR (ptr), "d"(value) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(float2 *ptr, float2 value) { asm ("st.global.wt.v2.f32 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(float4 *ptr, float4 value) { asm ("st.global.wt.v4.f32 [%0], {%1,%2,%3,%4};"  :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
+__SM_32_INTRINSICS_DECL__ void __stwt(double2 *ptr, double2 value) { asm ("st.global.wt.v2.f64 [%0], {%1,%2};"  :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
+#undef __LDG_PTR
+// SHF is the "funnel shift" operation - an accelerated left/right shift with carry
+// operating on 64-bit quantities, which are concatenations of two 32-bit registers.
+// This shifts [hi:lo] left by "shift" bits, returning the most significant 32 bits of the result.
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift) // Funnel shift left, wrap mode: shifts the 64-bit value hi:lo left and returns the upper 32 bits.
+{
+    unsigned int ret; // receives asm output operand %0
+    asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // %1 = lo, %2 = hi, %3 = shift; per the PTX ISA, .wrap uses (shift & 31) as the shift amount
+    return ret;
+}
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) // Funnel shift left, clamp mode: shifts the 64-bit value hi:lo left and returns the upper 32 bits.
+{
+    unsigned int ret; // receives asm output operand %0
+    asm volatile ("shf.l.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // %1 = lo, %2 = hi, %3 = shift; per the PTX ISA, .clamp uses min(shift, 32) as the shift amount
+    return ret;
+}
+// This shifts [hi:lo] right by "shift" bits, returning the least significant 32 bits of the result.
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift) // Funnel shift right, wrap mode: shifts the 64-bit value hi:lo right and returns the lower 32 bits.
+{
+    unsigned int ret; // receives asm output operand %0
+    asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // %1 = lo, %2 = hi, %3 = shift; per the PTX ISA, .wrap uses (shift & 31) as the shift amount
+    return ret;
+}
+__SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) // Funnel shift right, clamp mode: shifts the 64-bit value hi:lo right and returns the lower 32 bits.
+{
+    unsigned int ret; // receives asm output operand %0
+    asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // %1 = lo, %2 = hi, %3 = shift; per the PTX ISA, .clamp uses min(shift, 32) as the shift amount
+    return ret;
+}
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+#endif /* __cplusplus && __CUDACC__ */
+#undef __SM_32_INTRINSICS_DECL__
+#endif /* !__SM_32_INTRINSICS_HPP__ */

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.h ADDED Viewed

	@@ -0,0 +1,330 @@

+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+//NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
+#define EXCLUDE_FROM_RTC
+#if !defined(__SM_60_ATOMIC_FUNCTIONS_H__)
+#define __SM_60_ATOMIC_FUNCTIONS_H__
+#if defined(__CUDACC_RTC__) /* Select the declaration specifiers that prefix every atomic declaration below; this branch (__CUDACC_RTC__ defined) uses plain __device__. */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
+#elif defined(_NVHPC_CUDA) /* _NVHPC_CUDA defined: extern __device__ functions marked __cudart_builtin__ */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
+#else /* neither __CUDACC_RTC__ nor _NVHPC_CUDA: file-local (static) inline __device__ functions */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+#if defined(__cplusplus) && defined(__CUDACC__)
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#include "cuda_runtime_api.h"
+/* __DEF_IF_HOST is the terminator appended to every atomic declaration
+ * below:
+ *  - when __CUDA_ARCH__ is not defined and the compiler is not NVHPC, it
+ *    expands to an empty body "{ }", so each declaration becomes a
+ *    definition that does nothing (presumably the host compilation pass);
+ *  - otherwise it expands to ";", leaving a plain declaration.
+ *
+ * Add !defined(_NVHPC_CUDA) to avoid empty function definition in CUDA
+ * C++ compiler where the macro __CUDA_ARCH__ is not defined. */
+#if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
+#define __DEF_IF_HOST { }
+#else  /* !__CUDA_ARCH__ */
+#define __DEF_IF_HOST ;
+#endif /* __CUDA_ARCH__ */
+/*******************************************************************************
+*                                                                              *
+* Atomic function declarations.                                                *
+*                                                                              *
+* Every entry has the shape                                                    *
+*     __SM_60_ATOMIC_FUNCTIONS_DECL__ <type> <name>(<type> *address,           *
+*                                               <type> val) __DEF_IF_HOST      *
+* where the leading macro supplies the declaration specifiers and the          *
+* trailing macro supplies either ";" or an empty body "{ }".                   *
+*                                                                              *
+* This first group is atomicAdd: the double overload of plain atomicAdd,       *
+* followed by the _block and _system suffixed variants for int, unsigned       *
+* int, unsigned long long, float and double.                                   *
+*                                                                              *
+* NOTE(review): per the CUDA C++ Programming Guide, the _block and _system     *
+* suffixes select thread-block and system scope for the atomic, and each       *
+* function returns the value that was stored at address before the update.     *
+* Neither is visible in this header; confirm against the guide.                *
+*                                                                              *
+*******************************************************************************/
+__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_block(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_system(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_block(float *address, float val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_system(float *address, float val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_block(double *address, double val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_system(double *address, double val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__ // atomicSub group: _block and _system variants, declared for int and unsigned int only
+int atomicSub_block(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_system(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__ // atomicExch group: _block and _system variants for int, unsigned int, unsigned long long and float
+int atomicExch_block(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_system(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_block(float *address, float val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_system(float *address, float val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__ // atomicMin group: _block and _system variants for int, long long, unsigned int and unsigned long long
+int atomicMin_block(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_system(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_block(long long *address, long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_system(long long *address, long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+/* atomicMax_block / atomicMax_system: int, long long, unsigned int and
+ * unsigned long long overloads.
+ * NOTE(review): presumably block-scoped and system-scoped atomic maximum of
+ * *address and val -- confirm against the CUDA C++ Programming Guide. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_block(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_system(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_block(long long *address, long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_system(long long *address, long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+/* atomicInc_block / atomicInc_system: declared for unsigned int only.
+ * NOTE(review): presumably the scoped forms of atomicInc, where val is the
+ * wrap-around bound rather than an addend -- confirm against the CUDA C++
+ * Programming Guide. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+/* atomicDec_block / atomicDec_system: declared for unsigned int only.
+ * NOTE(review): presumably the scoped forms of atomicDec, where val is the
+ * wrap-around bound rather than a subtrahend -- confirm against the CUDA C++
+ * Programming Guide. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+/* atomicCAS_block / atomicCAS_system: int, unsigned int and
+ * unsigned long long int overloads taking (address, compare, val).
+ * NOTE(review): presumably block-scoped and system-scoped compare-and-swap
+ * that stores val when *address equals compare -- confirm against the CUDA
+ * C++ Programming Guide. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_block(int *address, int compare, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_system(int *address, int compare, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
+                             unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
+                              unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_block(unsigned long long int *address,
+                                       unsigned long long int compare,
+                                       unsigned long long int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_system(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val) __DEF_IF_HOST
+/* atomicAnd_block / atomicAnd_system: int, long long, unsigned int and
+ * unsigned long long overloads.
+ * NOTE(review): presumably block-scoped and system-scoped atomic bitwise AND
+ * of *address with val -- confirm against the CUDA C++ Programming Guide. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_block(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_system(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_block(long long *address, long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_system(long long *address, long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+/* atomicOr_block / atomicOr_system: int, long long, unsigned int and
+ * unsigned long long overloads.
+ * NOTE(review): presumably block-scoped and system-scoped atomic bitwise OR
+ * of *address with val -- confirm against the CUDA C++ Programming Guide. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_block(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_system(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_block(long long *address, long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_system(long long *address, long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+/* atomicXor_block / atomicXor_system: int, long long, unsigned int and
+ * unsigned long long overloads.
+ * NOTE(review): presumably block-scoped and system-scoped atomic bitwise XOR
+ * of *address with val -- confirm against the CUDA C++ Programming Guide. */
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_block(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_system(int *address, int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_block(long long *address, long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_system(long long *address, long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+#endif /* __cplusplus && __CUDACC__ */
+#undef __SM_60_ATOMIC_FUNCTIONS_DECL__
+#undef __DEF_IF_HOST
+#if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
+#include "sm_60_atomic_functions.hpp"
+#endif /* !__CUDACC_RTC__  && defined(__CUDA_ARCH__)  */
+#endif /* !__SM_60_ATOMIC_FUNCTIONS_H__ */
+#undef EXCLUDE_FROM_RTC

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.hpp ADDED Viewed

	@@ -0,0 +1,161 @@

+/*
+ * Copyright 2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(__SM_61_INTRINSICS_HPP__)
+#define __SM_61_INTRINSICS_HPP__
+#if defined(__CUDACC_RTC__)
+#define __SM_61_INTRINSICS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
+#endif /* __CUDACC_RTC__ */
+#if defined(__cplusplus) && defined(__CUDACC__)
+#if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+#include "cuda_runtime_api.h"
+/*******************************************************************************
+*                                                                              *
+*  Below are implementations of SM-6.1 intrinsics which are included as        *
+*  source (instead of being built in to the compiler)                          *
+*                                                                              *
+*******************************************************************************/
+// 4a: wrappers around the PTX dp4a instruction.
+// Signed overload: issues dp4a.s32.s32 with srcA, srcB and the accumulator c
+// as 32-bit register operands and returns the instruction's 32-bit result.
+// NOTE(review): per the PTX ISA, dp4a presumably treats srcA and srcB as four
+// packed 8-bit lanes and adds their dot product to c -- confirm in the PTX docs.
+__SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) {
+    int ret;
+    asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+// Unsigned overload of __dp4a: issues dp4a.u32.u32 with srcA, srcB and the
+// accumulator c as 32-bit register operands and returns the 32-bit result.
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+// char4 overload of __dp4a: reinterprets each char4 operand as a 32-bit int
+// through a pointer cast, then issues the same dp4a.s32.s32 instruction as the
+// plain int overload.
+__SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) {
+    int ret;
+    asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
+    return ret;
+}
+// uchar4 overload of __dp4a: reinterprets each uchar4 operand as a 32-bit
+// unsigned int through a pointer cast, then issues the same dp4a.u32.u32
+// instruction as the plain unsigned overload.
+__SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
+    return ret;
+}
+// 2a.lo: wrappers around the PTX dp2a.lo instruction.
+// Signed overload: issues dp2a.lo.s32.s32 with srcA, srcB and the accumulator
+// c as 32-bit register operands and returns the instruction's 32-bit result.
+// NOTE(review): per the PTX ISA, dp2a.lo presumably multiplies the two 16-bit
+// lanes of srcA by the two low bytes of srcB and adds the sum to c -- confirm.
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) {
+    int ret;
+    asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+// Unsigned overload of __dp2a_lo: issues dp2a.lo.u32.u32 with srcA, srcB and
+// the accumulator c as 32-bit register operands and returns the 32-bit result.
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+// Vector overload of __dp2a_lo: reinterprets the short2 and char4 operands as
+// 32-bit ints through pointer casts, then issues the same dp2a.lo.s32.s32
+// instruction as the plain int overload.
+__SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) {
+    int ret;
+    asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
+    return ret;
+}
+// Unsigned vector overload of __dp2a_lo: reinterprets the ushort2 and uchar4
+// operands as 32-bit unsigned ints through pointer casts, then issues the same
+// dp2a.lo.u32.u32 instruction as the plain unsigned overload.
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
+    return ret;
+}
+// 2a.hi: wrappers around the PTX dp2a.hi instruction.
+// Signed overload: issues dp2a.hi.s32.s32 with srcA, srcB and the accumulator
+// c as 32-bit register operands and returns the instruction's 32-bit result.
+// NOTE(review): per the PTX ISA, dp2a.hi presumably multiplies the two 16-bit
+// lanes of srcA by the two high bytes of srcB and adds the sum to c -- confirm.
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) {
+    int ret;
+    asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+// Unsigned overload of __dp2a_hi: issues dp2a.hi.u32.u32 with srcA, srcB and
+// the accumulator c as 32-bit register operands and returns the 32-bit result.
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
+    return ret;
+}
+// Vector overload of __dp2a_hi: reinterprets the short2 and char4 operands as
+// 32-bit ints through pointer casts, then issues the same dp2a.hi.s32.s32
+// instruction as the plain int overload.
+__SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) {
+    int ret;
+    asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
+    return ret;
+}
+// Unsigned vector overload of __dp2a_hi: reinterprets the ushort2 and uchar4
+// operands as 32-bit unsigned ints through pointer casts, then issues the same
+// dp2a.hi.u32.u32 instruction as the plain unsigned overload.
+__SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) {
+    unsigned int ret;
+    asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
+    return ret;
+}
+#endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
+#endif /* __cplusplus && __CUDACC__ */
+#undef __SM_61_INTRINSICS_DECL__
+#endif /* !__SM_61_INTRINSICS_HPP__ */

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (6.96 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/convert.cpython-312.pyc ADDED Viewed

Binary file (16.2 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/pack.cpython-312.pyc ADDED Viewed

Binary file (4.49 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/tags.cpython-312.pyc ADDED Viewed

Binary file (6.78 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/unpack.cpython-312.pyc ADDED Viewed

Binary file (1.56 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (220 Bytes). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE ADDED Viewed

	@@ -0,0 +1,3 @@

+This software is made available under the terms of *either* of the licenses
+found in LICENSE.APACHE or LICENSE.BSD. Contributions to this software is made
+under the terms of *both* these licenses.

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE.APACHE ADDED Viewed

	@@ -0,0 +1,177 @@

+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+   1. Definitions.
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+   END OF TERMS AND CONDITIONS

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE.BSD ADDED Viewed

	@@ -0,0 +1,23 @@

+Copyright (c) Donald Stufft and individual contributors.
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+    1. Redistributions of source code must retain the above copyright notice,
+       this list of conditions and the following disclaimer.
+    2. Redistributions in binary form must reproduce the above copyright
+       notice, this list of conditions and the following disclaimer in the
+       documentation and/or other materials provided with the distribution.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__init__.py ADDED Viewed

File without changes

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (230 Bytes). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_elffile.cpython-312.pyc ADDED Viewed

Binary file (5.06 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_manylinux.cpython-312.pyc ADDED Viewed

Binary file (9.93 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_musllinux.cpython-312.pyc ADDED Viewed

Binary file (4.61 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_parser.cpython-312.pyc ADDED Viewed

Binary file (14.1 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_structures.cpython-312.pyc ADDED Viewed

Binary file (3.28 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_tokenizer.cpython-312.pyc ADDED Viewed

Binary file (7.97 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/markers.cpython-312.pyc ADDED Viewed

Binary file (10.6 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/requirements.cpython-312.pyc ADDED Viewed

Binary file (4.49 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/specifiers.cpython-312.pyc ADDED Viewed

Binary file (39.6 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/tags.cpython-312.pyc ADDED Viewed

Binary file (21.8 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/utils.cpython-312.pyc ADDED Viewed

Binary file (7.32 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/version.cpython-312.pyc ADDED Viewed

Binary file (20 kB). View file

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_elffile.py ADDED Viewed

	@@ -0,0 +1,108 @@

+"""
+ELF file parser.
+This provides a class ``ELFFile`` that parses an ELF executable in a similar
+interface to ``ZipFile``. Only the read interface is implemented.
+Based on: https://gist.github.com/lyssdod/f51579ae8d93c8657a5564aefc2ffbca
+ELF header: https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html
+"""
+import enum
+import os
+import struct
+from typing import IO, Optional, Tuple
+class ELFInvalid(ValueError):
+    """Raised by ``ELFFile`` when the identification bytes or header cannot be parsed."""
+    pass
+class EIClass(enum.IntEnum):
+    """Bitness codes; the values match the capacity keys ``ELFFile`` dispatches on."""
+    C32 = 1  # 32-bit
+    C64 = 2  # 64-bit
+class EIData(enum.IntEnum):
+    """Byte-order codes; the values match the encoding keys ``ELFFile`` dispatches on."""
+    Lsb = 1  # least-significant byte first (little-endian struct formats)
+    Msb = 2  # most-significant byte first (big-endian struct formats)
+class EMachine(enum.IntEnum):
+    """Architecture codes, presumably compared against ``ELFFile.machine`` by callers."""
+    I386 = 3
+    S390 = 22
+    Arm = 40
+    X8664 = 62
+    # NOTE(review): the name looks like a misspelling of "AArch64", but it is
+    # part of the public interface, so renaming it could break importers.
+    AArc64 = 183
+class ELFFile:
+    """
+    Representation of an ELF executable.
+    The constructor parses the identification bytes and the rest of the ELF
+    header from ``f`` and raises ``ELFInvalid`` if either cannot be parsed.
+    ``f`` must stay open and seekable afterwards, because ``interpreter`` reads
+    the program header entries from it lazily.
+    """
+    def __init__(self, f: IO[bytes]) -> None:
+        self._f = f
+        try:
+            # The first 16 bytes are the identification block.
+            ident = self._read("16B")
+        except struct.error:
+            raise ELFInvalid("unable to parse identification")
+        magic = bytes(ident[:4])
+        if magic != b"\x7fELF":
+            raise ELFInvalid(f"invalid magic: {magic!r}")
+        self.capacity = ident[4]  # Bitness: 1 = 32-bit, 2 = 64-bit.
+        self.encoding = ident[5]  # Endianness: 1 = LSB, 2 = MSB.
+        try:
+            # e_fmt: Format for the ELF header fields that follow the ident bytes.
+            # p_fmt: Format for one program header entry.
+            # p_idx: Indexes to find p_type, p_offset, and p_filesz.
+            e_fmt, self._p_fmt, self._p_idx = {
+                (1, 1): ("<HHIIIIIHHH", "<IIIIIIII", (0, 1, 4)),  # 32-bit LSB.
+                (1, 2): (">HHIIIIIHHH", ">IIIIIIII", (0, 1, 4)),  # 32-bit MSB.
+                (2, 1): ("<HHIQQQIHHH", "<IIQQQQQQ", (0, 2, 5)),  # 64-bit LSB.
+                (2, 2): (">HHIQQQIHHH", ">IIQQQQQQ", (0, 2, 5)),  # 64-bit MSB.
+            }[(self.capacity, self.encoding)]
+        except KeyError:
+            raise ELFInvalid(
+                f"unrecognized capacity ({self.capacity}) or "
+                f"encoding ({self.encoding})"
+            )
+        try:
+            # Only the fields needed later are kept; the rest are discarded.
+            (
+                _,
+                self.machine,  # Architecture type.
+                _,
+                _,
+                self._e_phoff,  # Offset of program header.
+                _,
+                self.flags,  # Processor-specific flags.
+                _,
+                self._e_phentsize,  # Size of one program header entry.
+                self._e_phnum,  # Number of program header entries.
+            ) = self._read(e_fmt)
+        except struct.error as e:
+            raise ELFInvalid("unable to parse machine and section information") from e
+    def _read(self, fmt: str) -> Tuple[int, ...]:
+        """
+        Read ``struct.calcsize(fmt)`` bytes from the current file position and
+        unpack them with ``fmt``. A short read surfaces as ``struct.error``.
+        """
+        return struct.unpack(fmt, self._f.read(struct.calcsize(fmt)))
+    @property
+    def interpreter(self) -> Optional[str]:
+        """
+        The path recorded in the ``PT_INTERP`` program header entry, or
+        ``None`` if no such entry is found.
+        """
+        for index in range(self._e_phnum):
+            # Entries are laid out back to back starting at the table offset.
+            self._f.seek(self._e_phoff + self._e_phentsize * index)
+            try:
+                data = self._read(self._p_fmt)
+            except struct.error:
+                # An entry that cannot be unpacked is skipped rather than fatal.
+                continue
+            if data[self._p_idx[0]] != 3:  # Not PT_INTERP.
+                continue
+            # Jump to the entry's file offset and read its recorded size; the
+            # NUL padding around the path is stripped.
+            self._f.seek(data[self._p_idx[1]])
+            return os.fsdecode(self._f.read(data[self._p_idx[2]])).strip("\0")
+        return None

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_musllinux.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""PEP 656 support.
+This module implements logic to detect if the currently running Python is
+linked against musl, and what musl version is used.
+"""
+import functools
+import re
+import subprocess
+import sys
+from typing import Iterator, NamedTuple, Optional, Sequence
+from ._elffile import ELFFile
+class _MuslVersion(NamedTuple):
+    major: int
+    minor: int
def _parse_musl_version(output: str) -> Optional[_MuslVersion]:
    """Extract the musl version from the loader's banner text.

    :param output: Text printed by the loader.
    :returns: The parsed version, or ``None`` when the first non-blank line
        does not start with ``musl`` or the second one carries no
        ``Version X.Y`` prefix.
    """
    stripped = (line.strip() for line in output.splitlines())
    banner = [line for line in stripped if line]
    if len(banner) < 2:
        return None
    if not banner[0].startswith("musl"):
        return None
    found = re.match(r"Version (\d+)\.(\d+)", banner[1])
    if found is None:
        return None
    major, minor = found.groups()
    return _MuslVersion(major=int(major), minor=int(minor))
@functools.lru_cache
def _get_musl_version(executable: str) -> Optional[_MuslVersion]:
    """Detect the musl version that ``executable`` is linked against.

    The executable's ELF interpreter path is looked up; if it mentions
    ``musl`` the loader is run and its stderr banner is parsed. A musl loader
    prints something like::

        musl libc (x86_64)
        Version 1.2.2
        Dynamic Program Loader

    :returns: The detected version, or ``None`` when the file cannot be read
        as ELF, has no interpreter, or the interpreter is not musl.
    """
    try:
        with open(executable, "rb") as stream:
            loader = ELFFile(stream).interpreter
    except (OSError, TypeError, ValueError):
        return None
    if loader is not None and "musl" in loader:
        completed = subprocess.run([loader], stderr=subprocess.PIPE, text=True)
        return _parse_musl_version(completed.stderr)
    return None
def platform_tags(archs: Sequence[str]) -> Iterator[str]:
    """Yield the musllinux tags the running interpreter is compatible with.

    :param archs: Sequence of compatible architectures.
        The first one shall be the closest to the actual architecture and be
        the part of platform tag after the ``linux_`` prefix, e.g. ``x86_64``.
        The ``linux_`` prefix is assumed as a prerequisite for the current
        platform to be musllinux-compatible.
    :returns: An iterator of compatible musllinux tags; for each architecture
        the minor version counts down from the detected one to 0. Nothing is
        yielded when no musl version is detected for ``sys.executable``.
    """
    detected = _get_musl_version(sys.executable)
    if detected is not None:
        for arch in archs:
            for minor in reversed(range(detected.minor + 1)):
                yield f"musllinux_{detected.major}_{minor}_{arch}"
if __name__ == "__main__":  # pragma: no cover
    # Debug helper: print the platform, the detected musl version and the
    # musllinux tags derived from them.
    import sysconfig

    plat = sysconfig.get_platform()
    assert plat.startswith("linux-"), "not linux"

    print("plat:", plat)
    print("musl:", _get_musl_version(sys.executable))
    print("tags:", end=" ")
    # platform_tags() iterates over a sequence of architecture names; passing
    # the bare string would iterate over its characters, so wrap it in a list.
    arch = re.sub(r"[.-]", "_", plat.split("-", 1)[-1])
    for t in platform_tags([arch]):
        print(t, end="\n      ")

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_parser.py ADDED Viewed

	@@ -0,0 +1,356 @@

+"""Handwritten parser of dependency specifiers.
+The docstring for each __parse_* function contains EBNF-inspired grammar representing
+the implementation.
+"""
+import ast
+from typing import Any, List, NamedTuple, Optional, Tuple, Union
+from ._tokenizer import DEFAULT_RULES, Tokenizer
class Node:
    """Base class for the leaf nodes of a parsed marker expression."""

    def __init__(self, value: str) -> None:
        self.value = value

    def __repr__(self) -> str:
        return "<{}('{}')>".format(self.__class__.__name__, self)

    def __str__(self) -> str:
        return self.value

    def serialize(self) -> str:
        """Return the textual form of the node; subclasses must implement it."""
        raise NotImplementedError
class Variable(Node):
    """Node holding the name of an environment marker variable."""

    def serialize(self) -> str:
        # Variables are emitted bare, without quoting.
        return f"{self}"
class Value(Node):
    """Node holding a literal string value."""

    def serialize(self) -> str:
        # Values are emitted wrapped in double quotes.
        return '"' + str(self) + '"'
class Op(Node):
    """Node holding a comparison operator."""

    def serialize(self) -> str:
        # Operators are emitted as written.
        return f"{self}"
# One side of a marker comparison: an environment variable or a literal.
MarkerVar = Union[Variable, Value]
# A single comparison: (left operand, operator, right operand).
MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
# The precise recursive definitions would be:
# MarkerAtom = Union[MarkerItem, List["MarkerAtom"]]
# MarkerList = List[Union["MarkerList", MarkerAtom, str]]
# but mypy does not support recursive type definitions, see
# https://github.com/python/mypy/issues/731
MarkerAtom = Any
MarkerList = List[Any]
class ParsedRequirement(NamedTuple):
    """Pieces of a dependency specifier as produced by the parser."""

    # Package name token at the start of the specifier.
    name: str
    # URL following ``@``; empty string when none was given.
    url: str
    # Extra names listed between square brackets.
    extras: List[str]
    # Concatenated version specifier text; empty string when none was given.
    specifier: str
    # Parsed marker expression, or None when no marker was given.
    marker: Optional[MarkerList]
+# --------------------------------------------------------------------------------------
+# Recursive descent parser for dependency specifier
+# --------------------------------------------------------------------------------------
def parse_requirement(source: str) -> ParsedRequirement:
    """Parse a dependency specifier string using the default token rules."""
    tokenizer = Tokenizer(source, rules=DEFAULT_RULES)
    return _parse_requirement(tokenizer)
def _parse_requirement(tokenizer: Tokenizer) -> ParsedRequirement:
    """
    requirement = WS? IDENTIFIER WS? extras WS? requirement_details
    """
    tokenizer.consume("WS")

    name = tokenizer.expect(
        "IDENTIFIER", expected="package name at the start of dependency specifier"
    ).text
    tokenizer.consume("WS")

    extras = _parse_extras(tokenizer)
    tokenizer.consume("WS")

    details = _parse_requirement_details(tokenizer)
    url, specifier, marker = details
    # Nothing may follow the details.
    tokenizer.expect("END", expected="end of dependency specifier")

    return ParsedRequirement(name, url, extras, specifier, marker)
def _parse_requirement_details(
    tokenizer: Tokenizer,
) -> Tuple[str, str, Optional[MarkerList]]:
    """
    requirement_details = AT URL (WS requirement_marker?)?
                        | specifier WS? (requirement_marker)?

    Returns ``(url, specifier, marker)``; the parts that are absent are
    ``""``, ``""`` and ``None`` respectively.
    """
    url = ""
    specifier = ""
    marker = None

    if tokenizer.check("AT"):
        # URL form: "@ <url>" optionally followed by whitespace and a marker.
        tokenizer.read()
        tokenizer.consume("WS")

        start = tokenizer.position
        url = tokenizer.expect("URL", expected="URL after @").text
        if not tokenizer.check("END", peek=True):
            tokenizer.expect("WS", expected="whitespace after URL")
            # The input might end after whitespace.
            if not tokenizer.check("END", peek=True):
                marker = _parse_requirement_marker(
                    tokenizer, span_start=start, after="URL and whitespace"
                )
        return (url, specifier, marker)

    # Specifier form: version constraints optionally followed by a marker.
    start = tokenizer.position
    specifier = _parse_specifier(tokenizer)
    tokenizer.consume("WS")

    if not tokenizer.check("END", peek=True):
        if specifier:
            context = "version specifier"
        else:
            context = "name and no valid version specifier"
        marker = _parse_requirement_marker(
            tokenizer, span_start=start, after=context
        )

    return (url, specifier, marker)
def _parse_requirement_marker(
    tokenizer: Tokenizer, *, span_start: int, after: str
) -> MarkerList:
    """
    requirement_marker = SEMICOLON marker WS?

    ``span_start`` and ``after`` are only used to build the syntax error that
    is raised when the semicolon is missing.
    """
    has_semicolon = tokenizer.check("SEMICOLON")
    if not has_semicolon:
        tokenizer.raise_syntax_error(
            f"Expected end or semicolon (after {after})",
            span_start=span_start,
        )
    tokenizer.read()

    parsed = _parse_marker(tokenizer)
    tokenizer.consume("WS")
    return parsed
def _parse_extras(tokenizer: Tokenizer) -> List[str]:
    """
    extras = (LEFT_BRACKET wsp* extras_list? wsp* RIGHT_BRACKET)?
    """
    if tokenizer.check("LEFT_BRACKET", peek=True):
        with tokenizer.enclosing_tokens(
            "LEFT_BRACKET",
            "RIGHT_BRACKET",
            around="extras",
        ):
            tokenizer.consume("WS")
            names = _parse_extras_list(tokenizer)
            tokenizer.consume("WS")
        return names

    # No opening bracket: the requirement has no extras section.
    return []
def _parse_extras_list(tokenizer: Tokenizer) -> List[str]:
    """
    extras_list = identifier (wsp* ',' wsp* identifier)*
    """
    names: List[str] = []

    if not tokenizer.check("IDENTIFIER"):
        return names
    names.append(tokenizer.read().text)

    while True:
        tokenizer.consume("WS")
        # Two names in a row mean the separating comma is missing.
        if tokenizer.check("IDENTIFIER", peek=True):
            tokenizer.raise_syntax_error("Expected comma between extra names")
        elif not tokenizer.check("COMMA"):
            return names

        tokenizer.read()
        tokenizer.consume("WS")
        names.append(
            tokenizer.expect("IDENTIFIER", expected="extra name after comma").text
        )
def _parse_specifier(tokenizer: Tokenizer) -> str:
    """
    specifier = LEFT_PARENTHESIS WS? version_many WS? RIGHT_PARENTHESIS
              | WS? version_many WS?
    """
    # The parentheses are optional; enclosing_tokens() only demands the
    # closing one when the opening one was present.
    parens = tokenizer.enclosing_tokens(
        "LEFT_PARENTHESIS",
        "RIGHT_PARENTHESIS",
        around="version specifier",
    )
    with parens:
        tokenizer.consume("WS")
        specifiers = _parse_version_many(tokenizer)
        tokenizer.consume("WS")

    return specifiers
def _parse_version_many(tokenizer: Tokenizer) -> str:
    """
    version_many = (SPECIFIER (WS? COMMA WS? SPECIFIER)*)?

    Returns the specifiers and the commas between them concatenated without
    whitespace; an empty string when no specifier is present.
    """
    pieces: List[str] = []
    while tokenizer.check("SPECIFIER"):
        start = tokenizer.position
        pieces.append(tokenizer.read().text)
        # A trailing ".*" or "+local" left over after the specifier means it
        # was attached to an operator that does not accept it.
        if tokenizer.check("VERSION_PREFIX_TRAIL", peek=True):
            tokenizer.raise_syntax_error(
                ".* suffix can only be used with `==` or `!=` operators",
                span_start=start,
                span_end=tokenizer.position + 1,
            )
        if tokenizer.check("VERSION_LOCAL_LABEL_TRAIL", peek=True):
            tokenizer.raise_syntax_error(
                "Local version label can only be used with `==` or `!=` operators",
                span_start=start,
                span_end=tokenizer.position,
            )
        tokenizer.consume("WS")
        if not tokenizer.check("COMMA"):
            break
        pieces.append(tokenizer.read().text)
        tokenizer.consume("WS")

    return "".join(pieces)
+# --------------------------------------------------------------------------------------
+# Recursive descent parser for marker expression
+# --------------------------------------------------------------------------------------
def parse_marker(source: str) -> MarkerList:
    """Parse a standalone marker expression using the default token rules."""
    tokenizer = Tokenizer(source, rules=DEFAULT_RULES)
    return _parse_full_marker(tokenizer)
def _parse_full_marker(tokenizer: Tokenizer) -> MarkerList:
    """Parse a marker and require that it is followed by the end of input."""
    parsed = _parse_marker(tokenizer)
    tokenizer.expect("END", expected="end of marker expression")
    return parsed
def _parse_marker(tokenizer: Tokenizer) -> MarkerList:
    """
    marker = marker_atom (BOOLOP marker_atom)*

    Returns a flat list alternating atoms and the boolean operator text
    between them.
    """
    parts = [_parse_marker_atom(tokenizer)]
    while tokenizer.check("BOOLOP"):
        boolop = tokenizer.read()
        right = _parse_marker_atom(tokenizer)
        parts.append(boolop.text)
        parts.append(right)
    return parts
def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom:
    """
    marker_atom = WS? LEFT_PARENTHESIS WS? marker WS? RIGHT_PARENTHESIS WS?
                | WS? marker_item WS?
    """
    tokenizer.consume("WS")
    if not tokenizer.check("LEFT_PARENTHESIS", peek=True):
        atom: MarkerAtom = _parse_marker_item(tokenizer)
    else:
        # Parenthesised group: parse the nested marker between the brackets.
        with tokenizer.enclosing_tokens(
            "LEFT_PARENTHESIS",
            "RIGHT_PARENTHESIS",
            around="marker expression",
        ):
            tokenizer.consume("WS")
            atom = _parse_marker(tokenizer)
            tokenizer.consume("WS")
    tokenizer.consume("WS")
    return atom
def _parse_marker_item(tokenizer: Tokenizer) -> MarkerItem:
    """
    marker_item = WS? marker_var WS? marker_op WS? marker_var WS?
    """
    tokenizer.consume("WS")
    left = _parse_marker_var(tokenizer)

    tokenizer.consume("WS")
    operator_node = _parse_marker_op(tokenizer)

    tokenizer.consume("WS")
    right = _parse_marker_var(tokenizer)

    tokenizer.consume("WS")
    return (left, operator_node, right)
def _parse_marker_var(tokenizer: Tokenizer) -> MarkerVar:
    """
    marker_var = VARIABLE | QUOTED_STRING
    """
    if tokenizer.check("VARIABLE"):
        # Dotted spellings such as "os.name" are mapped to "os_name".
        raw_name = tokenizer.read().text
        return process_env_var(raw_name.replace(".", "_"))
    elif tokenizer.check("QUOTED_STRING"):
        literal = tokenizer.read().text
        return process_python_str(literal)
    else:
        tokenizer.raise_syntax_error(
            message="Expected a marker variable or quoted string"
        )
def process_env_var(env_var: str) -> Variable:
    """Wrap a marker variable name, folding ``python_implementation`` into
    ``platform_python_implementation``."""
    implementation_names = ("platform_python_implementation", "python_implementation")
    if env_var not in implementation_names:
        return Variable(env_var)
    return Variable("platform_python_implementation")
def process_python_str(python_str: str) -> Value:
    """Evaluate a quoted string token as a Python literal and wrap its text."""
    literal = ast.literal_eval(python_str)
    text = str(literal)
    return Value(text)
def _parse_marker_op(tokenizer: Tokenizer) -> Op:
    """
    marker_op = IN | NOT IN | OP
    """
    if tokenizer.check("IN"):
        tokenizer.read()
        return Op("in")

    if tokenizer.check("NOT"):
        # "not" is only valid as the first half of "not in".
        tokenizer.read()
        tokenizer.expect("WS", expected="whitespace after 'not'")
        tokenizer.expect("IN", expected="'in' after 'not'")
        return Op("not in")

    if tokenizer.check("OP"):
        symbol = tokenizer.read().text
        return Op(symbol)

    return tokenizer.raise_syntax_error(
        "Expected marker operator, one of "
        "<=, <, !=, ==, >=, >, ~=, ===, in, not in"
    )

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_structures.py ADDED Viewed

	@@ -0,0 +1,61 @@

+# This file is dual licensed under the terms of the Apache License, Version
+# 2.0, and the BSD License. See the LICENSE file in the root of this repository
+# for complete details.
class InfinityType:
    """Sentinel whose ordering comparisons rank it above any other object."""

    def __repr__(self) -> str:
        return "Infinity"

    def __hash__(self) -> int:
        return hash(repr(self))

    def __eq__(self, other: object) -> bool:
        # Equal only to instances of the same class.
        return isinstance(other, self.__class__)

    # Ordering: never smaller than anything, always greater.
    def __lt__(self, other: object) -> bool:
        return False

    def __le__(self, other: object) -> bool:
        return False

    def __gt__(self, other: object) -> bool:
        return True

    def __ge__(self, other: object) -> bool:
        return True

    def __neg__(self: object) -> "NegativeInfinityType":
        return NegativeInfinity


# Module-level singleton instance.
Infinity = InfinityType()
class NegativeInfinityType:
    """Sentinel whose ordering comparisons rank it below any other object."""

    def __repr__(self) -> str:
        return "-Infinity"

    def __hash__(self) -> int:
        return hash(repr(self))

    def __eq__(self, other: object) -> bool:
        # Equal only to instances of the same class.
        return isinstance(other, self.__class__)

    # Ordering: always smaller, never greater than anything.
    def __lt__(self, other: object) -> bool:
        return True

    def __le__(self, other: object) -> bool:
        return True

    def __gt__(self, other: object) -> bool:
        return False

    def __ge__(self, other: object) -> bool:
        return False

    def __neg__(self: object) -> InfinityType:
        return Infinity


# Module-level singleton instance.
NegativeInfinity = NegativeInfinityType()

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_tokenizer.py ADDED Viewed

	@@ -0,0 +1,192 @@

+import contextlib
+import re
+from dataclasses import dataclass
+from typing import Dict, Iterator, NoReturn, Optional, Tuple, Union
+from .specifiers import Specifier
@dataclass
class Token:
    """A matched piece of the source text."""

    # Name of the rule that matched.
    name: str
    # The matched text.
    text: str
    # Offset in the source at which the match starts.
    position: int
class ParserSyntaxError(Exception):
    """The provided source text could not be parsed correctly."""

    def __init__(
        self,
        message: str,
        *,
        source: str,
        span: Tuple[int, int],
    ) -> None:
        self.message = message
        self.source = source
        self.span = span

        super().__init__()

    def __str__(self) -> str:
        # Three lines: the message, the source, and a "~~~^" underline that
        # starts at span[0] and places the caret at span[1].
        begin = self.span[0]
        end = self.span[1]
        underline = "{}{}^".format(" " * begin, "~" * (end - begin))
        return "\n    ".join((self.message, self.source, underline))
# Token name -> pattern (plain string or pre-compiled regex) used by Tokenizer.
# Plain strings are compiled by Tokenizer.__init__.
DEFAULT_RULES: "Dict[str, Union[str, re.Pattern[str]]]" = {
    # Punctuation.
    "LEFT_PARENTHESIS": r"\(",
    "RIGHT_PARENTHESIS": r"\)",
    "LEFT_BRACKET": r"\[",
    "RIGHT_BRACKET": r"\]",
    "SEMICOLON": r";",
    "COMMA": r",",
    # Single- or double-quoted string without an embedded quote of its own kind.
    "QUOTED_STRING": re.compile(
        r"""
            (
                ('[^']*')
                |
                ("[^"]*")
            )
        """,
        re.VERBOSE,
    ),
    # Marker operators and keywords.
    "OP": r"(===|==|~=|!=|<=|>=|<|>)",
    "BOOLOP": r"\b(or|and)\b",
    "IN": r"\bin\b",
    "NOT": r"\bnot\b",
    # Environment marker variable names; some accept "." or "_" as separator.
    "VARIABLE": re.compile(
        r"""
            \b(
                python_version
                |python_full_version
                |os[._]name
                |sys[._]platform
                |platform_(release|system)
                |platform[._](version|machine|python_implementation)
                |python_implementation
                |implementation_(name|version)
                |extra
            )\b
        """,
        re.VERBOSE,
    ),
    # Operator followed by version, built from the Specifier class's regexes.
    "SPECIFIER": re.compile(
        Specifier._operator_regex_str + Specifier._version_regex_str,
        re.VERBOSE | re.IGNORECASE,
    ),
    "AT": r"\@",
    "URL": r"[^ \t]+",
    "IDENTIFIER": r"\b[a-zA-Z0-9][a-zA-Z0-9._-]*\b",
    # Leftovers after a specifier, used only to produce better error messages.
    "VERSION_PREFIX_TRAIL": r"\.\*",
    "VERSION_LOCAL_LABEL_TRAIL": r"\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*",
    "WS": r"[ \t]+",
    "END": r"$",
}
class Tokenizer:
    """Context-sensitive token parsing.

    Provides methods to examine the input stream to check whether the next token
    matches.
    """

    def __init__(
        self,
        source: str,
        *,
        rules: "Dict[str, Union[str, re.Pattern[str]]]",
    ) -> None:
        """Store ``source`` and compile every pattern in ``rules``."""
        self.source = source
        self.rules: Dict[str, re.Pattern[str]] = {
            name: re.compile(pattern) for name, pattern in rules.items()
        }
        # Token loaded by a successful non-peek check(), pending a read().
        self.next_token: Optional[Token] = None
        # Offset in ``source`` where the next match is attempted.
        self.position = 0

    def consume(self, name: str) -> None:
        """Move beyond provided token name, if at current position."""
        if self.check(name):
            self.read()

    def check(self, name: str, *, peek: bool = False) -> bool:
        """Check whether the next token has the provided name.

        By default, if the check succeeds, the token *must* be read before
        another check. If `peek` is set to `True`, the token is not loaded and
        would need to be checked again.
        """
        assert (
            self.next_token is None
        ), f"Cannot check for {name!r}, already have {self.next_token!r}"
        assert name in self.rules, f"Unknown token name: {name!r}"

        expression = self.rules[name]

        match = expression.match(self.source, self.position)
        if match is None:
            return False
        if not peek:
            self.next_token = Token(name, match[0], self.position)
        return True

    def expect(self, name: str, *, expected: str) -> Token:
        """Expect a certain token name next, failing with a syntax error otherwise.

        On success the token is read (consumed) and returned.

        :param expected: Description of what was expected, used in the error
            message ``Expected <expected>``.
        :raises ParserSyntaxError: If the next token does not match ``name``.
        """
        if not self.check(name):
            # raise_syntax_error() never returns, so no outer ``raise`` is needed.
            self.raise_syntax_error(f"Expected {expected}")
        return self.read()

    def read(self) -> Token:
        """Consume the next token and return it."""
        token = self.next_token
        assert token is not None

        self.position += len(token.text)
        self.next_token = None

        return token

    def raise_syntax_error(
        self,
        message: str,
        *,
        span_start: Optional[int] = None,
        span_end: Optional[int] = None,
    ) -> NoReturn:
        """Raise ParserSyntaxError at the given position.

        Either end of the span defaults to the current position.
        """
        span = (
            self.position if span_start is None else span_start,
            self.position if span_end is None else span_end,
        )
        raise ParserSyntaxError(
            message,
            source=self.source,
            span=span,
        )

    @contextlib.contextmanager
    def enclosing_tokens(
        self, open_token: str, close_token: str, *, around: str
    ) -> Iterator[None]:
        """Context manager for an optionally bracketed section.

        If ``open_token`` is next it is consumed, and after the ``with`` body
        ``close_token`` is required and consumed. If ``open_token`` is absent,
        nothing is required after the body.

        :raises ParserSyntaxError: If the section was opened but not closed.
        """
        if self.check(open_token):
            open_position = self.position
            self.read()
        else:
            open_position = None

        yield

        if open_position is None:
            return

        if not self.check(close_token):
            self.raise_syntax_error(
                f"Expected matching {close_token} for {open_token}, after {around}",
                span_start=open_position,
            )

        self.read()

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/markers.py ADDED Viewed

	@@ -0,0 +1,253 @@

+# This file is dual licensed under the terms of the Apache License, Version
+# 2.0, and the BSD License. See the LICENSE file in the root of this repository
+# for complete details.
+import operator
+import os
+import platform
+import sys
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+from ._parser import (
+    MarkerAtom,
+    MarkerList,
+    Op,
+    Value,
+    Variable,
+)
+from ._parser import (
+    parse_marker as _parse_marker,
+)
+from ._tokenizer import ParserSyntaxError
+from .specifiers import InvalidSpecifier, Specifier
+from .utils import canonicalize_name
# Names exported by ``from ... import *``.
__all__ = [
    "InvalidMarker",
    "UndefinedComparison",
    "UndefinedEnvironmentName",
    "Marker",
    "default_environment",
]

# Signature shared by the comparison callables: (lhs, rhs) -> bool.
Operator = Callable[[str, str], bool]
class InvalidMarker(ValueError):
    """
    Raised for a marker string that cannot be parsed; users should refer to
    PEP 508.
    """
class UndefinedComparison(ValueError):
    """
    Raised when an operation is attempted on a value that does not support it.
    """
class UndefinedEnvironmentName(ValueError):
    """
    Raised when a name is used that does not exist inside of the environment.
    """
+def _normalize_extra_values(results: Any) -> Any:
+    """
+    Normalize extra values.
+    """
+    if isinstance(results[0], tuple):
+        lhs, op, rhs = results[0]
+        if isinstance(lhs, Variable) and lhs.value == "extra":
+            normalized_extra = canonicalize_name(rhs.value)
+            rhs = Value(normalized_extra)
+        elif isinstance(rhs, Variable) and rhs.value == "extra":
+            normalized_extra = canonicalize_name(lhs.value)
+            lhs = Value(normalized_extra)
+        results[0] = lhs, op, rhs
+    return results
def _format_marker(
    marker: Union[List[str], MarkerAtom, str], first: Optional[bool] = True
) -> str:
    """Render a parsed marker structure back into marker text.

    Nested lists are wrapped in parentheses, except for the outermost one
    (``first`` true).
    """
    assert isinstance(marker, (list, tuple, str))

    # Sometimes we have a structure like [[...]]: a single-item list whose
    # only item is itself a list. In that case we skip the rest of this
    # function so that we don't get extraneous () on the outside.
    is_single_wrapper = (
        isinstance(marker, list)
        and len(marker) == 1
        and isinstance(marker[0], (list, tuple))
    )
    if is_single_wrapper:
        return _format_marker(marker[0])

    if isinstance(marker, list):
        rendered = " ".join(_format_marker(m, first=False) for m in marker)
        if first:
            return rendered
        return "(" + rendered + ")"
    if isinstance(marker, tuple):
        return " ".join([node.serialize() for node in marker])
    return marker
# Fallback comparisons, keyed by the serialized operator text.
_operators: Dict[str, Operator] = {
    # Containment checks.
    "in": lambda needle, haystack: needle in haystack,
    "not in": lambda needle, haystack: needle not in haystack,
    # Rich comparisons from the ``operator`` module.
    "<": operator.lt,
    "<=": operator.le,
    "==": operator.eq,
    "!=": operator.ne,
    ">=": operator.ge,
    ">": operator.gt,
}
def _eval_op(lhs: str, op: Op, rhs: str) -> bool:
    """Evaluate ``lhs <op> rhs``.

    A version comparison is tried first; if operator + ``rhs`` is not a valid
    specifier, the operator is looked up in ``_operators`` instead.

    :raises UndefinedComparison: If the operator has no fallback either.
    """
    try:
        spec: Optional[Specifier] = Specifier("".join((op.serialize(), rhs)))
    except InvalidSpecifier:
        spec = None
    if spec is not None:
        return spec.contains(lhs, prereleases=True)

    comparator: Optional[Operator] = _operators.get(op.serialize())
    if comparator is not None:
        return comparator(lhs, rhs)
    raise UndefinedComparison(f"Undefined {op!r} on {lhs!r} and {rhs!r}.")
+def _normalize(*values: str, key: str) -> Tuple[str, ...]:
+    # PEP 685 – Comparison of extra names for optional distribution dependencies
+    # https://peps.python.org/pep-0685/
+    # > When comparing extra names, tools MUST normalize the names being
+    # > compared using the semantics outlined in PEP 503 for names
+    if key == "extra":
+        return tuple(canonicalize_name(v) for v in values)
+    # other environment markers don't have such standards
+    return values
def _evaluate_markers(markers: MarkerList, environment: Dict[str, str]) -> bool:
    """Evaluate a parsed marker list against ``environment``.

    Results are collected in groups separated by ``"or"``; the marker holds
    when every result of at least one group is true.
    """
    or_groups: List[List[bool]] = [[]]

    for marker in markers:
        assert isinstance(marker, (list, tuple, str))

        if isinstance(marker, list):
            # Nested (parenthesised) expression.
            or_groups[-1].append(_evaluate_markers(marker, environment))
        elif isinstance(marker, tuple):
            lhs, op, rhs = marker

            # Whichever side is a Variable is looked up in the environment.
            if isinstance(lhs, Variable):
                env_key = lhs.value
                left = environment[env_key]
                right = rhs.value
            else:
                left = lhs.value
                env_key = rhs.value
                right = environment[env_key]

            left, right = _normalize(left, right, key=env_key)
            or_groups[-1].append(_eval_op(left, op, right))
        else:
            assert marker in ["and", "or"]
            if marker == "or":
                or_groups.append([])

    return any(all(group) for group in or_groups)
def format_full_version(info: "sys._version_info") -> str:
    """Format a version-info object as ``major.minor.micro``.

    For a release level other than ``"final"``, the first letter of the level
    and the serial number are appended.
    """
    release = f"{info.major}.{info.minor}.{info.micro}"
    level = info.releaselevel
    if level == "final":
        return release
    return release + (level[0] + str(info.serial))
def default_environment() -> Dict[str, str]:
    """Build the marker environment for the running interpreter."""
    impl = sys.implementation
    impl_version = format_full_version(impl.version)
    impl_name = impl.name
    # python_version keeps only the first two components, e.g. "3.12".
    short_python_version = ".".join(platform.python_version_tuple()[:2])
    return {
        "implementation_name": impl_name,
        "implementation_version": impl_version,
        "os_name": os.name,
        "platform_machine": platform.machine(),
        "platform_release": platform.release(),
        "platform_system": platform.system(),
        "platform_version": platform.version(),
        "python_full_version": platform.python_version(),
        "platform_python_implementation": platform.python_implementation(),
        "python_version": short_python_version,
        "sys_platform": sys.platform,
    }
class Marker:
    """A parsed environment marker that can be rendered and evaluated."""

    def __init__(self, marker: str) -> None:
        """Parse ``marker``.

        :raises InvalidMarker: If the text is not a valid marker expression.
        """
        # Note: We create a Marker object without calling this constructor in
        #       packaging.requirements.Requirement. If any additional logic is
        #       added here, make sure to mirror/adapt Requirement.
        try:
            self._markers = _normalize_extra_values(_parse_marker(marker))
            # The attribute `_markers` can be described in terms of a recursive type:
            # MarkerList = List[Union[Tuple[Node, ...], str, MarkerList]]
            #
            # For example, the following expression:
            # python_version > "3.6" or (python_version == "3.6" and os_name == "unix")
            #
            # is parsed into:
            # [
            #     (<Variable('python_version')>, <Op('>')>, <Value('3.6')>),
            #     'or',
            #     [
            #         (<Variable('python_version')>, <Op('==')>, <Value('3.6')>),
            #         'and',
            #         (<Variable('os_name')>, <Op('==')>, <Value('unix')>)
            #     ]
            # ]
        except ParserSyntaxError as e:
            raise InvalidMarker(str(e)) from e

    def __str__(self) -> str:
        return _format_marker(self._markers)

    def __repr__(self) -> str:
        return f"<Marker('{self}')>"

    def __hash__(self) -> int:
        # Consistent with __eq__, which compares the rendered text.
        return hash((self.__class__.__name__, str(self)))

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, Marker):
            return NotImplemented

        return str(self) == str(other)

    def evaluate(self, environment: Optional[Dict[str, str]] = None) -> bool:
        """Evaluate a marker.

        Return the boolean from evaluating the given marker against the
        environment. environment is an optional argument to override all or
        part of the determined environment.

        The environment is determined from the current Python process.
        """
        current_environment = default_environment()
        # "extra" is not part of the default environment; start with no extra.
        current_environment["extra"] = ""
        if environment is not None:
            current_environment.update(environment)
            # The API used to allow setting extra to None. We need to handle this
            # case for backwards compatibility.
            if current_environment["extra"] is None:
                current_environment["extra"] = ""

        return _evaluate_markers(self._markers, current_environment)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/specifiers.py ADDED Viewed

	@@ -0,0 +1,1011 @@

+# This file is dual licensed under the terms of the Apache License, Version
+# 2.0, and the BSD License. See the LICENSE file in the root of this repository
+# for complete details.
+"""
+.. testsetup::
+    from packaging.specifiers import Specifier, SpecifierSet, InvalidSpecifier
+    from packaging.version import Version
+"""
+import abc
+import itertools
+import re
+from typing import Callable, Iterable, Iterator, List, Optional, Tuple, TypeVar, Union
+from .utils import canonicalize_version
+from .version import Version
# A version given either as a parsed Version or as its string form.
UnparsedVersion = Union[Version, str]
# Type variable bounded by UnparsedVersion, used by filter() signatures.
UnparsedVersionVar = TypeVar("UnparsedVersionVar", bound=UnparsedVersion)
# Signature of a comparison callable: (version, spec string) -> bool.
CallableOperator = Callable[[Version, str], bool]
def _coerce_version(version: UnparsedVersion) -> Version:
    """Return ``version`` unchanged if it is a Version, else parse it into one."""
    if isinstance(version, Version):
        return version
    return Version(version)
class InvalidSpecifier(ValueError):
    """
    Error raised when a :class:`Specifier` is created from an invalid
    specifier string.

    >>> Specifier("lolwat")
    Traceback (most recent call last):
        ...
    packaging.specifiers.InvalidSpecifier: Invalid specifier: 'lolwat'
    """
class BaseSpecifier(metaclass=abc.ABCMeta):
    """Abstract interface shared by specifier-like objects."""

    @abc.abstractmethod
    def __str__(self) -> str:
        """
        Return the string form of this Specifier-like object; it should be
        representative of the specifier itself.
        """

    @abc.abstractmethod
    def __hash__(self) -> int:
        """
        Return a hash value for this Specifier-like object.
        """

    @abc.abstractmethod
    def __eq__(self, other: object) -> bool:
        """
        Return whether the two Specifier-like objects are equal.

        :param other: The other object to check against.
        """

    @property
    @abc.abstractmethod
    def prereleases(self) -> Optional[bool]:
        """Whether or not pre-releases as a whole are allowed.

        ``True`` or ``False`` explicitly enables or disables prereleases;
        ``None`` (the default) selects the default semantics.
        """

    @prereleases.setter
    def prereleases(self, value: bool) -> None:
        """Setter for :attr:`prereleases`.

        :param value: The value to set.
        """

    @abc.abstractmethod
    def contains(self, item: str, prereleases: Optional[bool] = None) -> bool:
        """
        Determine whether the given item is contained within this specifier.
        """

    @abc.abstractmethod
    def filter(
        self, iterable: Iterable[UnparsedVersionVar], prereleases: Optional[bool] = None
    ) -> Iterator[UnparsedVersionVar]:
        """
        Take an iterable of items and yield only those that are contained
        within this specifier.
        """
+class Specifier(BaseSpecifier):
+    """This class abstracts handling of version specifiers.
+    .. tip::
+        It is generally not required to instantiate this manually. You should instead
+        prefer to work with :class:`SpecifierSet` instead, which can parse
+        comma-separated version specifiers (which is what package metadata contains).
+    """
+    _operator_regex_str = r"""
+        (?P<operator>(~=|==|!=|<=|>=|<|>|===))
+        """
+    _version_regex_str = r"""
+        (?P<version>
+            (?:
+                # The identity operators allow for an escape hatch that will
+                # do an exact string match of the version you wish to install.
+                # This will not be parsed by PEP 440 and we cannot determine
+                # any semantic meaning from it. This operator is discouraged
+                # but included entirely as an escape hatch.
+                (?<====)  # Only match for the identity operator
+                \s*
+                [^\s;)]*  # The arbitrary version can be just about anything,
+                          # we match everything except for whitespace, a
+                          # semi-colon for marker support, and a closing paren
+                          # since versions can be enclosed in them.
+            )
+            |
+            (?:
+                # The (non)equality operators allow for wild card and local
+                # versions to be specified so we have to define these two
+                # operators separately to enable that.
+                (?<===|!=)            # Only match for equals and not equals
+                \s*
+                v?
+                (?:[0-9]+!)?          # epoch
+                [0-9]+(?:\.[0-9]+)*   # release
+                # You cannot use a wild card and a pre-release, post-release, a dev or
+                # local version together so group them with a | and make them optional.
+                (?:
+                    \.\*  # Wild card syntax of .*
+                    |
+                    (?:                                  # pre release
+                        [-_\.]?
+                        (alpha|beta|preview|pre|a|b|c|rc)
+                        [-_\.]?
+                        [0-9]*
+                    )?
+                    (?:                                  # post release
+                        (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*)
+                    )?
+                    (?:[-_\.]?dev[-_\.]?[0-9]*)?         # dev release
+                    (?:\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*)? # local
+                )?
+            )
+            |
+            (?:
+                # The compatible operator requires at least two digits in the
+                # release segment.
+                (?<=~=)               # Only match for the compatible operator
+                \s*
+                v?
+                (?:[0-9]+!)?          # epoch
+                [0-9]+(?:\.[0-9]+)+   # release  (We have a + instead of a *)
+                (?:                   # pre release
+                    [-_\.]?
+                    (alpha|beta|preview|pre|a|b|c|rc)
+                    [-_\.]?
+                    [0-9]*
+                )?
+                (?:                                   # post release
+                    (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*)
+                )?
+                (?:[-_\.]?dev[-_\.]?[0-9]*)?          # dev release
+            )
+            |
+            (?:
+                # All other operators only allow a sub set of what the
+                # (non)equality operators do. Specifically they do not allow
+                # local versions to be specified nor do they allow the prefix
+                # matching wild cards.
+                (?<!==|!=|~=)         # We have special cases for these
+                                      # operators so we want to make sure they
+                                      # don't match here.
+                \s*
+                v?
+                (?:[0-9]+!)?          # epoch
+                [0-9]+(?:\.[0-9]+)*   # release
+                (?:                   # pre release
+                    [-_\.]?
+                    (alpha|beta|preview|pre|a|b|c|rc)
+                    [-_\.]?
+                    [0-9]*
+                )?
+                (?:                                   # post release
+                    (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*)
+                )?
+                (?:[-_\.]?dev[-_\.]?[0-9]*)?          # dev release
+            )
+        )
+        """
+    _regex = re.compile(
+        r"^\s*" + _operator_regex_str + _version_regex_str + r"\s*$",
+        re.VERBOSE | re.IGNORECASE,
+    )
+    _operators = {
+        "~=": "compatible",
+        "==": "equal",
+        "!=": "not_equal",
+        "<=": "less_than_equal",
+        ">=": "greater_than_equal",
+        "<": "less_than",
+        ">": "greater_than",
+        "===": "arbitrary",
+    }
+    def __init__(self, spec: str = "", prereleases: Optional[bool] = None) -> None:
+        """Initialize a Specifier instance.
+        :param spec:
+            The string representation of a specifier which will be parsed and
+            normalized before use.
+        :param prereleases:
+            This tells the specifier if it should accept prerelease versions if
+            applicable or not. The default of ``None`` will autodetect it from the
+            given specifiers.
+        :raises InvalidSpecifier:
+            If the given specifier is invalid (i.e. bad syntax).
+        """
+        match = self._regex.search(spec)
+        if not match:
+            raise InvalidSpecifier(f"Invalid specifier: '{spec}'")
+        self._spec: Tuple[str, str] = (
+            match.group("operator").strip(),
+            match.group("version").strip(),
+        )
+        # Store whether or not this Specifier should accept prereleases
+        self._prereleases = prereleases
+    # https://github.com/python/mypy/pull/13475#pullrequestreview-1079784515
+    @property  # type: ignore[override]
+    def prereleases(self) -> bool:
+        # If there is an explicit prereleases set for this, then we'll just
+        # blindly use that.
+        if self._prereleases is not None:
+            return self._prereleases
+        # Look at all of our specifiers and determine if they are inclusive
+        # operators, and if they are if they are including an explicit
+        # prerelease.
+        operator, version = self._spec
+        if operator in ["==", ">=", "<=", "~=", "==="]:
+            # The == specifier can include a trailing .*, if it does we
+            # want to remove before parsing.
+            if operator == "==" and version.endswith(".*"):
+                version = version[:-2]
+            # Parse the version, and if it is a pre-release than this
+            # specifier allows pre-releases.
+            if Version(version).is_prerelease:
+                return True
+        return False
+    @prereleases.setter
+    def prereleases(self, value: bool) -> None:
+        self._prereleases = value
+    @property
+    def operator(self) -> str:
+        """The operator of this specifier.
+        >>> Specifier("==1.2.3").operator
+        '=='
+        """
+        return self._spec[0]
+    @property
+    def version(self) -> str:
+        """The version of this specifier.
+        >>> Specifier("==1.2.3").version
+        '1.2.3'
+        """
+        return self._spec[1]
+    def __repr__(self) -> str:
+        """A representation of the Specifier that shows all internal state.
+        >>> Specifier('>=1.0.0')
+        <Specifier('>=1.0.0')>
+        >>> Specifier('>=1.0.0', prereleases=False)
+        <Specifier('>=1.0.0', prereleases=False)>
+        >>> Specifier('>=1.0.0', prereleases=True)
+        <Specifier('>=1.0.0', prereleases=True)>
+        """
+        pre = (
+            f", prereleases={self.prereleases!r}"
+            if self._prereleases is not None
+            else ""
+        )
+        return f"<{self.__class__.__name__}({str(self)!r}{pre})>"
+    def __str__(self) -> str:
+        """A string representation of the Specifier that can be round-tripped.
+        >>> str(Specifier('>=1.0.0'))
+        '>=1.0.0'
+        >>> str(Specifier('>=1.0.0', prereleases=False))
+        '>=1.0.0'
+        """
+        return "{}{}".format(*self._spec)
+    @property
+    def _canonical_spec(self) -> Tuple[str, str]:
+        canonical_version = canonicalize_version(
+            self._spec[1],
+            strip_trailing_zero=(self._spec[0] != "~="),
+        )
+        return self._spec[0], canonical_version
+    def __hash__(self) -> int:
+        return hash(self._canonical_spec)
+    def __eq__(self, other: object) -> bool:
+        """Whether or not the two Specifier-like objects are equal.
+        :param other: The other object to check against.
+        The value of :attr:`prereleases` is ignored.
+        >>> Specifier("==1.2.3") == Specifier("== 1.2.3.0")
+        True
+        >>> (Specifier("==1.2.3", prereleases=False) ==
+        ...  Specifier("==1.2.3", prereleases=True))
+        True
+        >>> Specifier("==1.2.3") == "==1.2.3"
+        True
+        >>> Specifier("==1.2.3") == Specifier("==1.2.4")
+        False
+        >>> Specifier("==1.2.3") == Specifier("~=1.2.3")
+        False
+        """
+        if isinstance(other, str):
+            try:
+                other = self.__class__(str(other))
+            except InvalidSpecifier:
+                return NotImplemented
+        elif not isinstance(other, self.__class__):
+            return NotImplemented
+        return self._canonical_spec == other._canonical_spec
+    def _get_operator(self, op: str) -> CallableOperator:
+        operator_callable: CallableOperator = getattr(
+            self, f"_compare_{self._operators[op]}"
+        )
+        return operator_callable
+    def _compare_compatible(self, prospective: Version, spec: str) -> bool:
+        # Compatible releases have an equivalent combination of >= and ==. That
+        # is that ~=2.2 is equivalent to >=2.2,==2.*. This allows us to
+        # implement this in terms of the other specifiers instead of
+        # implementing it ourselves. The only thing we need to do is construct
+        # the other specifiers.
+        # We want everything but the last item in the version, but we want to
+        # ignore suffix segments.
+        prefix = _version_join(
+            list(itertools.takewhile(_is_not_suffix, _version_split(spec)))[:-1]
+        )
+        # Add the prefix notation to the end of our string
+        prefix += ".*"
+        return self._get_operator(">=")(prospective, spec) and self._get_operator("==")(
+            prospective, prefix
+        )
+    def _compare_equal(self, prospective: Version, spec: str) -> bool:
+        # We need special logic to handle prefix matching
+        if spec.endswith(".*"):
+            # In the case of prefix matching we want to ignore local segment.
+            normalized_prospective = canonicalize_version(
+                prospective.public, strip_trailing_zero=False
+            )
+            # Get the normalized version string ignoring the trailing .*
+            normalized_spec = canonicalize_version(spec[:-2], strip_trailing_zero=False)
+            # Split the spec out by bangs and dots, and pretend that there is
+            # an implicit dot in between a release segment and a pre-release segment.
+            split_spec = _version_split(normalized_spec)
+            # Split the prospective version out by bangs and dots, and pretend
+            # that there is an implicit dot in between a release segment and
+            # a pre-release segment.
+            split_prospective = _version_split(normalized_prospective)
+            # 0-pad the prospective version before shortening it to get the correct
+            # shortened version.
+            padded_prospective, _ = _pad_version(split_prospective, split_spec)
+            # Shorten the prospective version to be the same length as the spec
+            # so that we can determine if the specifier is a prefix of the
+            # prospective version or not.
+            shortened_prospective = padded_prospective[: len(split_spec)]
+            return shortened_prospective == split_spec
+        else:
+            # Convert our spec string into a Version
+            spec_version = Version(spec)
+            # If the specifier does not have a local segment, then we want to
+            # act as if the prospective version also does not have a local
+            # segment.
+            if not spec_version.local:
+                prospective = Version(prospective.public)
+            return prospective == spec_version
+    def _compare_not_equal(self, prospective: Version, spec: str) -> bool:
+        return not self._compare_equal(prospective, spec)
+    def _compare_less_than_equal(self, prospective: Version, spec: str) -> bool:
+        # NB: Local version identifiers are NOT permitted in the version
+        # specifier, so local version labels can be universally removed from
+        # the prospective version.
+        return Version(prospective.public) <= Version(spec)
+    def _compare_greater_than_equal(self, prospective: Version, spec: str) -> bool:
+        # NB: Local version identifiers are NOT permitted in the version
+        # specifier, so local version labels can be universally removed from
+        # the prospective version.
+        return Version(prospective.public) >= Version(spec)
+    def _compare_less_than(self, prospective: Version, spec_str: str) -> bool:
+        # Convert our spec to a Version instance, since we'll want to work with
+        # it as a version.
+        spec = Version(spec_str)
+        # Check to see if the prospective version is less than the spec
+        # version. If it's not we can short circuit and just return False now
+        # instead of doing extra unneeded work.
+        if not prospective < spec:
+            return False
+        # This special case is here so that, unless the specifier itself
+        # includes is a pre-release version, that we do not accept pre-release
+        # versions for the version mentioned in the specifier (e.g. <3.1 should
+        # not match 3.1.dev0, but should match 3.0.dev0).
+        if not spec.is_prerelease and prospective.is_prerelease:
+            if Version(prospective.base_version) == Version(spec.base_version):
+                return False
+        # If we've gotten to here, it means that prospective version is both
+        # less than the spec version *and* it's not a pre-release of the same
+        # version in the spec.
+        return True
+    def _compare_greater_than(self, prospective: Version, spec_str: str) -> bool:
+        # Convert our spec to a Version instance, since we'll want to work with
+        # it as a version.
+        spec = Version(spec_str)
+        # Check to see if the prospective version is greater than the spec
+        # version. If it's not we can short circuit and just return False now
+        # instead of doing extra unneeded work.
+        if not prospective > spec:
+            return False
+        # This special case is here so that, unless the specifier itself
+        # includes is a post-release version, that we do not accept
+        # post-release versions for the version mentioned in the specifier
+        # (e.g. >3.1 should not match 3.0.post0, but should match 3.2.post0).
+        if not spec.is_postrelease and prospective.is_postrelease:
+            if Version(prospective.base_version) == Version(spec.base_version):
+                return False
+        # Ensure that we do not allow a local version of the version mentioned
+        # in the specifier, which is technically greater than, to match.
+        if prospective.local is not None:
+            if Version(prospective.base_version) == Version(spec.base_version):
+                return False
+        # If we've gotten to here, it means that prospective version is both
+        # greater than the spec version *and* it's not a pre-release of the
+        # same version in the spec.
+        return True
+    def _compare_arbitrary(self, prospective: Version, spec: str) -> bool:
+        return str(prospective).lower() == str(spec).lower()
+    def __contains__(self, item: Union[str, Version]) -> bool:
+        """Return whether or not the item is contained in this specifier.
+        :param item: The item to check for.
+        This is used for the ``in`` operator and behaves the same as
+        :meth:`contains` with no ``prereleases`` argument passed.
+        >>> "1.2.3" in Specifier(">=1.2.3")
+        True
+        >>> Version("1.2.3") in Specifier(">=1.2.3")
+        True
+        >>> "1.0.0" in Specifier(">=1.2.3")
+        False
+        >>> "1.3.0a1" in Specifier(">=1.2.3")
+        False
+        >>> "1.3.0a1" in Specifier(">=1.2.3", prereleases=True)
+        True
+        """
+        return self.contains(item)
+    def contains(
+        self, item: UnparsedVersion, prereleases: Optional[bool] = None
+    ) -> bool:
+        """Return whether or not the item is contained in this specifier.
+        :param item:
+            The item to check for, which can be a version string or a
+            :class:`Version` instance.
+        :param prereleases:
+            Whether or not to match prereleases with this Specifier. If set to
+            ``None`` (the default), it uses :attr:`prereleases` to determine
+            whether or not prereleases are allowed.
+        >>> Specifier(">=1.2.3").contains("1.2.3")
+        True
+        >>> Specifier(">=1.2.3").contains(Version("1.2.3"))
+        True
+        >>> Specifier(">=1.2.3").contains("1.0.0")
+        False
+        >>> Specifier(">=1.2.3").contains("1.3.0a1")
+        False
+        >>> Specifier(">=1.2.3", prereleases=True).contains("1.3.0a1")
+        True
+        >>> Specifier(">=1.2.3").contains("1.3.0a1", prereleases=True)
+        True
+        """
+        # Determine if prereleases are to be allowed or not.
+        if prereleases is None:
+            prereleases = self.prereleases
+        # Normalize item to a Version, this allows us to have a shortcut for
+        # "2.0" in Specifier(">=2")
+        normalized_item = _coerce_version(item)
+        # Determine if we should be supporting prereleases in this specifier
+        # or not, if we do not support prereleases than we can short circuit
+        # logic if this version is a prereleases.
+        if normalized_item.is_prerelease and not prereleases:
+            return False
+        # Actually do the comparison to determine if this item is contained
+        # within this Specifier or not.
+        operator_callable: CallableOperator = self._get_operator(self.operator)
+        return operator_callable(normalized_item, self.version)
+    def filter(
+        self, iterable: Iterable[UnparsedVersionVar], prereleases: Optional[bool] = None
+    ) -> Iterator[UnparsedVersionVar]:
+        """Filter items in the given iterable, that match the specifier.
+        :param iterable:
+            An iterable that can contain version strings and :class:`Version` instances.
+            The items in the iterable will be filtered according to the specifier.
+        :param prereleases:
+            Whether or not to allow prereleases in the returned iterator. If set to
+            ``None`` (the default), it will be intelligently decide whether to allow
+            prereleases or not (based on the :attr:`prereleases` attribute, and
+            whether the only versions matching are prereleases).
+        This method is smarter than just ``filter(Specifier().contains, [...])``
+        because it implements the rule from :pep:`440` that a prerelease item
+        SHOULD be accepted if no other versions match the given specifier.
+        >>> list(Specifier(">=1.2.3").filter(["1.2", "1.3", "1.5a1"]))
+        ['1.3']
+        >>> list(Specifier(">=1.2.3").filter(["1.2", "1.2.3", "1.3", Version("1.4")]))
+        ['1.2.3', '1.3', <Version('1.4')>]
+        >>> list(Specifier(">=1.2.3").filter(["1.2", "1.5a1"]))
+        ['1.5a1']
+        >>> list(Specifier(">=1.2.3").filter(["1.3", "1.5a1"], prereleases=True))
+        ['1.3', '1.5a1']
+        >>> list(Specifier(">=1.2.3", prereleases=True).filter(["1.3", "1.5a1"]))
+        ['1.3', '1.5a1']
+        """
+        yielded = False
+        found_prereleases = []
+        kw = {"prereleases": prereleases if prereleases is not None else True}
+        # Attempt to iterate over all the values in the iterable and if any of
+        # them match, yield them.
+        for version in iterable:
+            parsed_version = _coerce_version(version)
+            if self.contains(parsed_version, **kw):
+                # If our version is a prerelease, and we were not set to allow
+                # prereleases, then we'll store it for later in case nothing
+                # else matches this specifier.
+                if parsed_version.is_prerelease and not (
+                    prereleases or self.prereleases
+                ):
+                    found_prereleases.append(version)
+                # Either this is not a prerelease, or we should have been
+                # accepting prereleases from the beginning.
+                else:
+                    yielded = True
+                    yield version
+        # Now that we've iterated over everything, determine if we've yielded
+        # any values, and if we have not and we have any prereleases stored up
+        # then we will go ahead and yield the prereleases.
+        if not yielded and found_prereleases:
+            for version in found_prereleases:
+                yield version
+_prefix_regex = re.compile(r"^([0-9]+)((?:a|b|c|rc)[0-9]+)$")
+def _version_split(version: str) -> List[str]:
+    """Split version into components.
+    The split components are intended for version comparison. The logic does
+    not attempt to retain the original version string, so joining the
+    components back with :func:`_version_join` may not produce the original
+    version string.
+    """
+    result: List[str] = []
+    epoch, _, rest = version.rpartition("!")
+    result.append(epoch or "0")
+    for item in rest.split("."):
+        match = _prefix_regex.search(item)
+        if match:
+            result.extend(match.groups())
+        else:
+            result.append(item)
+    return result
+def _version_join(components: List[str]) -> str:
+    """Join split version components into a version string.
+    This function assumes the input came from :func:`_version_split`, where the
+    first component must be the epoch (either empty or numeric), and all other
+    components numeric.
+    """
+    epoch, *rest = components
+    return f"{epoch}!{'.'.join(rest)}"
+def _is_not_suffix(segment: str) -> bool:
+    return not any(
+        segment.startswith(prefix) for prefix in ("dev", "a", "b", "rc", "post")
+    )
+def _pad_version(left: List[str], right: List[str]) -> Tuple[List[str], List[str]]:
+    left_split, right_split = [], []
+    # Get the release segment of our versions
+    left_split.append(list(itertools.takewhile(lambda x: x.isdigit(), left)))
+    right_split.append(list(itertools.takewhile(lambda x: x.isdigit(), right)))
+    # Get the rest of our versions
+    left_split.append(left[len(left_split[0]) :])
+    right_split.append(right[len(right_split[0]) :])
+    # Insert our padding
+    left_split.insert(1, ["0"] * max(0, len(right_split[0]) - len(left_split[0])))
+    right_split.insert(1, ["0"] * max(0, len(left_split[0]) - len(right_split[0])))
+    return (
+        list(itertools.chain.from_iterable(left_split)),
+        list(itertools.chain.from_iterable(right_split)),
+    )
+class SpecifierSet(BaseSpecifier):
+    """This class abstracts handling of a set of version specifiers.
+    It can be passed a single specifier (``>=3.0``), a comma-separated list of
+    specifiers (``>=3.0,!=3.1``), or no specifier at all.
+    """
+    def __init__(
+        self, specifiers: str = "", prereleases: Optional[bool] = None
+    ) -> None:
+        """Initialize a SpecifierSet instance.
+        :param specifiers:
+            The string representation of a specifier or a comma-separated list of
+            specifiers which will be parsed and normalized before use.
+        :param prereleases:
+            This tells the SpecifierSet if it should accept prerelease versions if
+            applicable or not. The default of ``None`` will autodetect it from the
+            given specifiers.
+        :raises InvalidSpecifier:
+            If the given ``specifiers`` are not parseable than this exception will be
+            raised.
+        """
+        # Split on `,` to break each individual specifier into it's own item, and
+        # strip each item to remove leading/trailing whitespace.
+        split_specifiers = [s.strip() for s in specifiers.split(",") if s.strip()]
+        # Make each individual specifier a Specifier and save in a frozen set for later.
+        self._specs = frozenset(map(Specifier, split_specifiers))
+        # Store our prereleases value so we can use it later to determine if
+        # we accept prereleases or not.
+        self._prereleases = prereleases
+    @property
+    def prereleases(self) -> Optional[bool]:
+        # If we have been given an explicit prerelease modifier, then we'll
+        # pass that through here.
+        if self._prereleases is not None:
+            return self._prereleases
+        # If we don't have any specifiers, and we don't have a forced value,
+        # then we'll just return None since we don't know if this should have
+        # pre-releases or not.
+        if not self._specs:
+            return None
+        # Otherwise we'll see if any of the given specifiers accept
+        # prereleases, if any of them do we'll return True, otherwise False.
+        return any(s.prereleases for s in self._specs)
+    @prereleases.setter
+    def prereleases(self, value: bool) -> None:
+        self._prereleases = value
+    def __repr__(self) -> str:
+        """A representation of the specifier set that shows all internal state.
+        Note that the ordering of the individual specifiers within the set may not
+        match the input string.
+        >>> SpecifierSet('>=1.0.0,!=2.0.0')
+        <SpecifierSet('!=2.0.0,>=1.0.0')>
+        >>> SpecifierSet('>=1.0.0,!=2.0.0', prereleases=False)
+        <SpecifierSet('!=2.0.0,>=1.0.0', prereleases=False)>
+        >>> SpecifierSet('>=1.0.0,!=2.0.0', prereleases=True)
+        <SpecifierSet('!=2.0.0,>=1.0.0', prereleases=True)>
+        """
+        pre = (
+            f", prereleases={self.prereleases!r}"
+            if self._prereleases is not None
+            else ""
+        )
+        return f"<SpecifierSet({str(self)!r}{pre})>"
+    def __str__(self) -> str:
+        """A string representation of the specifier set that can be round-tripped.
+        Note that the ordering of the individual specifiers within the set may not
+        match the input string.
+        >>> str(SpecifierSet(">=1.0.0,!=1.0.1"))
+        '!=1.0.1,>=1.0.0'
+        >>> str(SpecifierSet(">=1.0.0,!=1.0.1", prereleases=False))
+        '!=1.0.1,>=1.0.0'
+        """
+        return ",".join(sorted(str(s) for s in self._specs))
+    def __hash__(self) -> int:
+        """Return the hash of the underlying frozen set of specifiers.
+        The ``prereleases`` override does not take part in the hash.
+        """
+        return hash(self._specs)
+    def __and__(self, other: Union["SpecifierSet", str]) -> "SpecifierSet":
+        """Return a SpecifierSet which is a combination of the two sets.
+        :param other: The other object to combine with.
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") & '<=2.0.0,!=2.0.1'
+        <SpecifierSet('!=1.0.1,!=2.0.1,<=2.0.0,>=1.0.0')>
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") & SpecifierSet('<=2.0.0,!=2.0.1')
+        <SpecifierSet('!=1.0.1,!=2.0.1,<=2.0.0,>=1.0.0')>
+        """
+        if isinstance(other, str):
+            other = SpecifierSet(other)
+        elif not isinstance(other, SpecifierSet):
+            return NotImplemented
+        specifier = SpecifierSet()
+        specifier._specs = frozenset(self._specs | other._specs)
+        if self._prereleases is None and other._prereleases is not None:
+            specifier._prereleases = other._prereleases
+        elif self._prereleases is not None and other._prereleases is None:
+            specifier._prereleases = self._prereleases
+        elif self._prereleases == other._prereleases:
+            specifier._prereleases = self._prereleases
+        else:
+            raise ValueError(
+                "Cannot combine SpecifierSets with True and False prerelease "
+                "overrides."
+            )
+        return specifier
+    def __eq__(self, other: object) -> bool:
+        """Whether or not the two SpecifierSet-like objects are equal.
+        :param other: The other object to check against.
+        The value of :attr:`prereleases` is ignored.
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0,!=1.0.1")
+        True
+        >>> (SpecifierSet(">=1.0.0,!=1.0.1", prereleases=False) ==
+        ...  SpecifierSet(">=1.0.0,!=1.0.1", prereleases=True))
+        True
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") == ">=1.0.0,!=1.0.1"
+        True
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0")
+        False
+        >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0,!=1.0.2")
+        False
+        """
+        if isinstance(other, (str, Specifier)):
+            other = SpecifierSet(str(other))
+        elif not isinstance(other, SpecifierSet):
+            return NotImplemented
+        return self._specs == other._specs
+    def __len__(self) -> int:
+        """Returns the number of specifiers in this specifier set.
+        Specifiers that compare equal are stored once, because the set keeps
+        them in a frozenset.
+        """
+        return len(self._specs)
+    def __iter__(self) -> Iterator[Specifier]:
+        """
+        Returns an iterator over all the underlying :class:`Specifier` instances
+        in this specifier set.
+        The specifiers live in a frozenset, so the iteration order is not
+        specified; sort the result (as below) when a stable order is needed.
+        >>> sorted(SpecifierSet(">=1.0.0,!=1.0.1"), key=str)
+        [<Specifier('!=1.0.1')>, <Specifier('>=1.0.0')>]
+        """
+        return iter(self._specs)
+    def __contains__(self, item: UnparsedVersion) -> bool:
+        """Return whether or not the item is contained in this specifier.
+        :param item: The item to check for, as a version string or a
+            :class:`Version` instance.
+        This is used for the ``in`` operator and behaves the same as
+        :meth:`contains` with no ``prereleases`` argument passed.
+        >>> "1.2.3" in SpecifierSet(">=1.0.0,!=1.0.1")
+        True
+        >>> Version("1.2.3") in SpecifierSet(">=1.0.0,!=1.0.1")
+        True
+        >>> "1.0.1" in SpecifierSet(">=1.0.0,!=1.0.1")
+        False
+        >>> "1.3.0a1" in SpecifierSet(">=1.0.0,!=1.0.1")
+        False
+        >>> "1.3.0a1" in SpecifierSet(">=1.0.0,!=1.0.1", prereleases=True)
+        True
+        """
+        # Delegate with the default arguments of ``contains``.
+        return self.contains(item)
+    def contains(
+        self,
+        item: UnparsedVersion,
+        prereleases: Optional[bool] = None,
+        installed: Optional[bool] = None,
+    ) -> bool:
+        """Return whether or not the item is contained in this SpecifierSet.
+        :param item:
+            The item to check for, which can be a version string or a
+            :class:`Version` instance.
+        :param prereleases:
+            Whether or not to match prereleases with this SpecifierSet. If set to
+            ``None`` (the default), it uses :attr:`prereleases` to determine
+            whether or not prereleases are allowed.
+        >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.2.3")
+        True
+        >>> SpecifierSet(">=1.0.0,!=1.0.1").contains(Version("1.2.3"))
+        True
+        >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.0.1")
+        False
+        >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.3.0a1")
+        False
+        >>> SpecifierSet(">=1.0.0,!=1.0.1", prereleases=True).contains("1.3.0a1")
+        True
+        >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.3.0a1", prereleases=True)
+        True
+        """
+        # Ensure that our item is a Version instance.
+        if not isinstance(item, Version):
+            item = Version(item)
+        # Determine if we're forcing a prerelease or not, if we're not forcing
+        # one for this particular filter call, then we'll use whatever the
+        # SpecifierSet thinks for whether or not we should support prereleases.
+        if prereleases is None:
+            prereleases = self.prereleases
+        # We can determine if we're going to allow pre-releases by looking to
+        # see if any of the underlying items supports them. If none of them do
+        # and this item is a pre-release then we do not allow it and we can
+        # short circuit that here.
+        # Note: This means that 1.0.dev1 would not be contained in something
+        #       like >=1.0.devabc however it would be in >=1.0.debabc,>0.0.dev0
+        if not prereleases and item.is_prerelease:
+            return False
+        if installed and item.is_prerelease:
+            item = Version(item.base_version)
+        # We simply dispatch to the underlying specs here to make sure that the
+        # given version is contained within all of them.
+        # Note: This use of all() here means that an empty set of specifiers
+        #       will always return True, this is an explicit design decision.
+        return all(s.contains(item, prereleases=prereleases) for s in self._specs)
+    def filter(
+        self, iterable: Iterable[UnparsedVersionVar], prereleases: Optional[bool] = None
+    ) -> Iterator[UnparsedVersionVar]:
+        """Filter items in the given iterable, that match the specifiers in this set.
+        :param iterable:
+            An iterable that can contain version strings and :class:`Version` instances.
+            The items in the iterable will be filtered according to the specifier.
+        :param prereleases:
+            Whether or not to allow prereleases in the returned iterator. If set to
+            ``None`` (the default), it will intelligently decide whether to allow
+            prereleases or not (based on the :attr:`prereleases` attribute, and
+            whether the only versions matching are prereleases).
+        :returns:
+            An iterator over the matching items, yielded in the same form
+            (string or :class:`Version`) in which they were supplied.
+        This method is smarter than just ``filter(SpecifierSet(...).contains, [...])``
+        because it implements the rule from :pep:`440` that a prerelease item
+        SHOULD be accepted if no other versions match the given specifier.
+        >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.3", "1.5a1"]))
+        ['1.3']
+        >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.3", Version("1.4")]))
+        ['1.3', <Version('1.4')>]
+        >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.5a1"]))
+        []
+        >>> list(SpecifierSet(">=1.2.3").filter(["1.3", "1.5a1"], prereleases=True))
+        ['1.3', '1.5a1']
+        >>> list(SpecifierSet(">=1.2.3", prereleases=True).filter(["1.3", "1.5a1"]))
+        ['1.3', '1.5a1']
+        An "empty" SpecifierSet will filter items based on the presence of prerelease
+        versions in the set.
+        >>> list(SpecifierSet("").filter(["1.3", "1.5a1"]))
+        ['1.3']
+        >>> list(SpecifierSet("").filter(["1.5a1"]))
+        ['1.5a1']
+        >>> list(SpecifierSet("", prereleases=True).filter(["1.3", "1.5a1"]))
+        ['1.3', '1.5a1']
+        >>> list(SpecifierSet("").filter(["1.3", "1.5a1"], prereleases=True))
+        ['1.3', '1.5a1']
+        """
+        # Determine if we're forcing a prerelease or not, if we're not forcing
+        # one for this particular filter call, then we'll use whatever the
+        # SpecifierSet thinks for whether or not we should support prereleases.
+        if prereleases is None:
+            prereleases = self.prereleases
+        # If we have any specifiers, then we want to wrap our iterable in the
+        # filter method for each one, this will act as a logical AND amongst
+        # each specifier.
+        if self._specs:
+            for spec in self._specs:
+                # Each specifier receives a plain bool, so an undecided (None)
+                # setting is passed on as False.
+                iterable = spec.filter(iterable, prereleases=bool(prereleases))
+            return iter(iterable)
+        # If we do not have any specifiers, then we need to have a rough filter
+        # which will filter out any pre-releases, unless there are no final
+        # releases.
+        else:
+            filtered: List[UnparsedVersionVar] = []
+            found_prereleases: List[UnparsedVersionVar] = []
+            for item in iterable:
+                parsed_version = _coerce_version(item)
+                # Store any item which is a pre-release for later unless we've
+                # already found a final version or we are accepting prereleases
+                if parsed_version.is_prerelease and not prereleases:
+                    if not filtered:
+                        found_prereleases.append(item)
+                else:
+                    filtered.append(item)
+            # If we've found no items except for pre-releases, then we'll go
+            # ahead and use the pre-releases
+            # (``prereleases`` is still None here only when neither the caller
+            # nor ``self.prereleases`` expressed a preference.)
+            if not filtered and found_prereleases and prereleases is None:
+                return iter(found_prereleases)
+            return iter(filtered)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/tags.py ADDED Viewed

	@@ -0,0 +1,571 @@

+# This file is dual licensed under the terms of the Apache License, Version
+# 2.0, and the BSD License. See the LICENSE file in the root of this repository
+# for complete details.
+import logging
+import platform
+import re
+import struct
+import subprocess
+import sys
+import sysconfig
+from importlib.machinery import EXTENSION_SUFFIXES
+from typing import (
+    Dict,
+    FrozenSet,
+    Iterable,
+    Iterator,
+    List,
+    Optional,
+    Sequence,
+    Tuple,
+    Union,
+    cast,
+)
+from . import _manylinux, _musllinux
+# Module-level logger, named after this module.
+logger = logging.getLogger(__name__)
+# A Python version given as a sequence of ints, e.g. ``sys.version_info[:2]``.
+PythonVersion = Sequence[int]
+# A macOS version as a two-item tuple of ints.
+MacVersion = Tuple[int, int]
+# Maps ``sys.implementation.name`` values to their short interpreter prefix;
+# see interpreter_name(), which falls back to the full name for other values.
+INTERPRETER_SHORT_NAMES: Dict[str, str] = {
+    "python": "py",  # Generic.
+    "cpython": "cp",
+    "pypy": "pp",
+    "ironpython": "ip",
+    "jython": "jy",
+}
+# True when a C pointer ("P") occupies 4 bytes in the running interpreter.
+_32_BIT_INTERPRETER = struct.calcsize("P") == 4
+class Tag:
+    """
+    A representation of the tag triple for a wheel.
+    Instances are considered immutable and thus are hashable. Equality checking
+    is also supported.
+    """
+    __slots__ = ["_interpreter", "_abi", "_platform", "_hash"]
+    def __init__(self, interpreter: str, abi: str, platform: str) -> None:
+        self._interpreter = interpreter.lower()
+        self._abi = abi.lower()
+        self._platform = platform.lower()
+        # The __hash__ of every single element in a Set[Tag] will be evaluated each time
+        # that a set calls its `.disjoint()` method, which may be called hundreds of
+        # times when scanning a page of links for packages with tags matching that
+        # Set[Tag]. Pre-computing the value here produces significant speedups for
+        # downstream consumers.
+        self._hash = hash((self._interpreter, self._abi, self._platform))
+    @property
+    def interpreter(self) -> str:
+        return self._interpreter
+    @property
+    def abi(self) -> str:
+        return self._abi
+    @property
+    def platform(self) -> str:
+        return self._platform
+    def __eq__(self, other: object) -> bool:
+        if not isinstance(other, Tag):
+            return NotImplemented
+        return (
+            (self._hash == other._hash)  # Short-circuit ASAP for perf reasons.
+            and (self._platform == other._platform)
+            and (self._abi == other._abi)
+            and (self._interpreter == other._interpreter)
+        )
+    def __hash__(self) -> int:
+        return self._hash
+    def __str__(self) -> str:
+        return f"{self._interpreter}-{self._abi}-{self._platform}"
+    def __repr__(self) -> str:
+        return f"<{self} @ {id(self)}>"
+def parse_tag(tag: str) -> FrozenSet[Tag]:
+    """
+    Parses the provided tag (e.g. `py3-none-any`) into a frozenset of Tag instances.
+    Returning a set is required due to the possibility that the tag is a
+    compressed tag set.
+    """
+    tags = set()
+    interpreters, abis, platforms = tag.split("-")
+    for interpreter in interpreters.split("."):
+        for abi in abis.split("."):
+            for platform_ in platforms.split("."):
+                tags.add(Tag(interpreter, abi, platform_))
+    return frozenset(tags)
+def _get_config_var(name: str, warn: bool = False) -> Union[int, str, None]:
+    value: Union[int, str, None] = sysconfig.get_config_var(name)
+    if value is None and warn:
+        logger.debug(
+            "Config variable '%s' is unset, Python ABI tag may be incorrect", name
+        )
+    return value
+def _normalize_string(string: str) -> str:
+    return string.replace(".", "_").replace("-", "_").replace(" ", "_")
+def _is_threaded_cpython(abis: List[str]) -> bool:
+    """
+    Determine if the ABI corresponds to a threaded (`--disable-gil`) build.
+    The threaded builds are indicated by a "t" in the abiflags.
+    """
+    if len(abis) == 0:
+        return False
+    # expect e.g., cp313
+    m = re.match(r"cp\d+(.*)", abis[0])
+    if not m:
+        return False
+    abiflags = m.group(1)
+    return "t" in abiflags
+def _abi3_applies(python_version: PythonVersion, threading: bool) -> bool:
+    """
+    Determine if the Python version supports abi3.
+    PEP 384 was first implemented in Python 3.2. The threaded (`--disable-gil`)
+    builds do not support abi3.
+    """
+    return len(python_version) > 1 and tuple(python_version) >= (3, 2) and not threading
+def _cpython_abis(py_version: PythonVersion, warn: bool = False) -> List[str]:
+    """
+    Build the list of CPython ABI tags for ``py_version``.
+    The first entry is ``cp<version>`` followed by whichever flag letters
+    apply, in the order threading (``t``), debug (``d``), pymalloc (``m``) and
+    UCS-4 (``u``). On debug builds of 3.8 and later a second entry without
+    the debug flag follows.
+    :param py_version: The Python version; only the first two items are used
+        for the version part of the tag.
+    :param warn: Passed to :func:`_get_config_var` for each config lookup.
+    """
+    py_version = tuple(py_version)  # To allow for version comparison.
+    abis = []
+    version = _version_nodot(py_version[:2])
+    # Each flag stays empty unless the checks below turn it on.
+    threading = debug = pymalloc = ucs4 = ""
+    with_debug = _get_config_var("Py_DEBUG", warn)
+    has_refcount = hasattr(sys, "gettotalrefcount")
+    # Windows doesn't set Py_DEBUG, so checking for support of debug-compiled
+    # extension modules is the best option.
+    # https://github.com/pypa/pip/issues/3383#issuecomment-173267692
+    has_ext = "_d.pyd" in EXTENSION_SUFFIXES
+    # The fallbacks are consulted only when Py_DEBUG is unset (None).
+    if with_debug or (with_debug is None and (has_refcount or has_ext)):
+        debug = "d"
+    if py_version >= (3, 13) and _get_config_var("Py_GIL_DISABLED", warn):
+        threading = "t"
+    if py_version < (3, 8):
+        with_pymalloc = _get_config_var("WITH_PYMALLOC", warn)
+        # An unset WITH_PYMALLOC is treated the same as an enabled one.
+        if with_pymalloc or with_pymalloc is None:
+            pymalloc = "m"
+        if py_version < (3, 3):
+            unicode_size = _get_config_var("Py_UNICODE_SIZE", warn)
+            if unicode_size == 4 or (
+                unicode_size is None and sys.maxunicode == 0x10FFFF
+            ):
+                ucs4 = "u"
+    elif debug:
+        # Debug builds can also load "normal" extension modules.
+        # We can also assume no UCS-4 or pymalloc requirement.
+        abis.append(f"cp{version}{threading}")
+    # The fully flagged ABI always goes first.
+    abis.insert(0, f"cp{version}{threading}{debug}{pymalloc}{ucs4}")
+    return abis
+def cpython_tags(
+    python_version: Optional[PythonVersion] = None,
+    abis: Optional[Iterable[str]] = None,
+    platforms: Optional[Iterable[str]] = None,
+    *,
+    warn: bool = False,
+) -> Iterator[Tag]:
+    """
+    Yields the tags for a CPython interpreter.
+    The tags consist of:
+    - cp<python_version>-<abi>-<platform>
+    - cp<python_version>-abi3-<platform>
+    - cp<python_version>-none-<platform>
+    - cp<less than python_version>-abi3-<platform>  # Older Python versions down to 3.2.
+    If python_version only specifies a major version then user-provided ABIs and
+    the 'none' ABItag will be used.
+    If 'abi3' or 'none' are specified in 'abis' then they will be yielded at
+    their normal position and not at the beginning.
+    """
+    if not python_version:
+        python_version = sys.version_info[:2]
+    interpreter = f"cp{_version_nodot(python_version[:2])}"
+    if abis is None:
+        if len(python_version) > 1:
+            abis = _cpython_abis(python_version, warn)
+        else:
+            abis = []
+    abis = list(abis)
+    # 'abi3' and 'none' are explicitly handled later.
+    for explicit_abi in ("abi3", "none"):
+        try:
+            abis.remove(explicit_abi)
+        except ValueError:
+            pass
+    platforms = list(platforms or platform_tags())
+    for abi in abis:
+        for platform_ in platforms:
+            yield Tag(interpreter, abi, platform_)
+    threading = _is_threaded_cpython(abis)
+    use_abi3 = _abi3_applies(python_version, threading)
+    if use_abi3:
+        yield from (Tag(interpreter, "abi3", platform_) for platform_ in platforms)
+    yield from (Tag(interpreter, "none", platform_) for platform_ in platforms)
+    if use_abi3:
+        for minor_version in range(python_version[1] - 1, 1, -1):
+            for platform_ in platforms:
+                interpreter = "cp{version}".format(
+                    version=_version_nodot((python_version[0], minor_version))
+                )
+                yield Tag(interpreter, "abi3", platform_)
+def _generic_abi() -> List[str]:
+    """
+    Return the ABI tag based on EXT_SUFFIX.
+    """
+    # The following are examples of `EXT_SUFFIX`.
+    # We want to keep the parts which are related to the ABI and remove the
+    # parts which are related to the platform:
+    # - linux:   '.cpython-310-x86_64-linux-gnu.so' => cp310
+    # - mac:     '.cpython-310-darwin.so'           => cp310
+    # - win:     '.cp310-win_amd64.pyd'             => cp310
+    # - win:     '.pyd'                             => cp37 (uses _cpython_abis())
+    # - pypy:    '.pypy38-pp73-x86_64-linux-gnu.so' => pypy38_pp73
+    # - graalpy: '.graalpy-38-native-x86_64-darwin.dylib'
+    #                                               => graalpy_38_native
+    ext_suffix = _get_config_var("EXT_SUFFIX", warn=True)
+    if not isinstance(ext_suffix, str) or ext_suffix[0] != ".":
+        raise SystemError("invalid sysconfig.get_config_var('EXT_SUFFIX')")
+    parts = ext_suffix.split(".")
+    if len(parts) < 3:
+        # CPython3.7 and earlier uses ".pyd" on Windows.
+        return _cpython_abis(sys.version_info[:2])
+    soabi = parts[1]
+    if soabi.startswith("cpython"):
+        # non-windows
+        abi = "cp" + soabi.split("-")[1]
+    elif soabi.startswith("cp"):
+        # windows
+        abi = soabi.split("-")[0]
+    elif soabi.startswith("pypy"):
+        abi = "-".join(soabi.split("-")[:2])
+    elif soabi.startswith("graalpy"):
+        abi = "-".join(soabi.split("-")[:3])
+    elif soabi:
+        # pyston, ironpython, others?
+        abi = soabi
+    else:
+        return []
+    return [_normalize_string(abi)]
+def generic_tags(
+    interpreter: Optional[str] = None,
+    abis: Optional[Iterable[str]] = None,
+    platforms: Optional[Iterable[str]] = None,
+    *,
+    warn: bool = False,
+) -> Iterator[Tag]:
+    """
+    Yields the tags for a generic interpreter.
+    The tags consist of:
+    - <interpreter>-<abi>-<platform>
+    The "none" ABI will be added if it was not explicitly provided.
+    """
+    if not interpreter:
+        interp_name = interpreter_name()
+        interp_version = interpreter_version(warn=warn)
+        interpreter = "".join([interp_name, interp_version])
+    if abis is None:
+        abis = _generic_abi()
+    else:
+        abis = list(abis)
+    platforms = list(platforms or platform_tags())
+    if "none" not in abis:
+        abis.append("none")
+    for abi in abis:
+        for platform_ in platforms:
+            yield Tag(interpreter, abi, platform_)
+def _py_interpreter_range(py_version: PythonVersion) -> Iterator[str]:
+    """
+    Yields Python versions in descending order.
+    After the latest version, the major-only version will be yielded, and then
+    all previous versions of that major version.
+    """
+    if len(py_version) > 1:
+        yield f"py{_version_nodot(py_version[:2])}"
+    yield f"py{py_version[0]}"
+    if len(py_version) > 1:
+        for minor in range(py_version[1] - 1, -1, -1):
+            yield f"py{_version_nodot((py_version[0], minor))}"
+def compatible_tags(
+    python_version: Optional[PythonVersion] = None,
+    interpreter: Optional[str] = None,
+    platforms: Optional[Iterable[str]] = None,
+) -> Iterator[Tag]:
+    """
+    Yields the sequence of tags that are compatible with a specific version of Python.
+    The tags consist of:
+    - py*-none-<platform>
+    - <interpreter>-none-any  # ... if `interpreter` is provided.
+    - py*-none-any
+    """
+    if not python_version:
+        python_version = sys.version_info[:2]
+    platforms = list(platforms or platform_tags())
+    for version in _py_interpreter_range(python_version):
+        for platform_ in platforms:
+            yield Tag(version, "none", platform_)
+    if interpreter:
+        yield Tag(interpreter, "none", "any")
+    for version in _py_interpreter_range(python_version):
+        yield Tag(version, "none", "any")
+def _mac_arch(arch: str, is_32bit: bool = _32_BIT_INTERPRETER) -> str:
+    if not is_32bit:
+        return arch
+    if arch.startswith("ppc"):
+        return "ppc"
+    return "i386"
+def _mac_binary_formats(version: MacVersion, cpu_arch: str) -> List[str]:
+    formats = [cpu_arch]
+    if cpu_arch == "x86_64":
+        if version < (10, 4):
+            return []
+        formats.extend(["intel", "fat64", "fat32"])
+    elif cpu_arch == "i386":
+        if version < (10, 4):
+            return []
+        formats.extend(["intel", "fat32", "fat"])
+    elif cpu_arch == "ppc64":
+        # TODO: Need to care about 32-bit PPC for ppc64 through 10.2?
+        if version > (10, 5) or version < (10, 4):
+            return []
+        formats.append("fat64")
+    elif cpu_arch == "ppc":
+        if version > (10, 6):
+            return []
+        formats.extend(["fat32", "fat"])
+    if cpu_arch in {"arm64", "x86_64"}:
+        formats.append("universal2")
+    if cpu_arch in {"x86_64", "i386", "ppc64", "ppc", "intel"}:
+        formats.append("universal")
+    return formats
+def mac_platforms(
+    version: Optional[MacVersion] = None, arch: Optional[str] = None
+) -> Iterator[str]:
+    """
+    Yields the platform tags for a macOS system.
+    The `version` parameter is a two-item tuple specifying the macOS version to
+    generate platform tags for. The `arch` parameter is the CPU architecture to
+    generate platform tags for. Both parameters default to the appropriate value
+    for the current system.
+    """
+    version_str, _, cpu_arch = platform.mac_ver()
+    if version is None:
+        version = cast("MacVersion", tuple(map(int, version_str.split(".")[:2])))
+        if version == (10, 16):
+            # When built against an older macOS SDK, Python will report macOS 10.16
+            # instead of the real version.
+            version_str = subprocess.run(
+                [
+                    sys.executable,
+                    "-sS",
+                    "-c",
+                    "import platform; print(platform.mac_ver()[0])",
+                ],
+                check=True,
+                env={"SYSTEM_VERSION_COMPAT": "0"},
+                stdout=subprocess.PIPE,
+                text=True,
+            ).stdout
+            version = cast("MacVersion", tuple(map(int, version_str.split(".")[:2])))
+    else:
+        version = version
+    if arch is None:
+        arch = _mac_arch(cpu_arch)
+    else:
+        arch = arch
+    if (10, 0) <= version and version < (11, 0):
+        # Prior to Mac OS 11, each yearly release of Mac OS bumped the
+        # "minor" version number.  The major version was always 10.
+        for minor_version in range(version[1], -1, -1):
+            compat_version = 10, minor_version
+            binary_formats = _mac_binary_formats(compat_version, arch)
+            for binary_format in binary_formats:
+                yield "macosx_{major}_{minor}_{binary_format}".format(
+                    major=10, minor=minor_version, binary_format=binary_format
+                )
+    if version >= (11, 0):
+        # Starting with Mac OS 11, each yearly release bumps the major version
+        # number.   The minor versions are now the midyear updates.
+        for major_version in range(version[0], 10, -1):
+            compat_version = major_version, 0
+            binary_formats = _mac_binary_formats(compat_version, arch)
+            for binary_format in binary_formats:
+                yield "macosx_{major}_{minor}_{binary_format}".format(
+                    major=major_version, minor=0, binary_format=binary_format
+                )
+    if version >= (11, 0):
+        # Mac OS 11 on x86_64 is compatible with binaries from previous releases.
+        # Arm64 support was introduced in 11.0, so no Arm binaries from previous
+        # releases exist.
+        #
+        # However, the "universal2" binary format can have a
+        # macOS version earlier than 11.0 when the x86_64 part of the binary supports
+        # that version of macOS.
+        if arch == "x86_64":
+            for minor_version in range(16, 3, -1):
+                compat_version = 10, minor_version
+                binary_formats = _mac_binary_formats(compat_version, arch)
+                for binary_format in binary_formats:
+                    yield "macosx_{major}_{minor}_{binary_format}".format(
+                        major=compat_version[0],
+                        minor=compat_version[1],
+                        binary_format=binary_format,
+                    )
+        else:
+            for minor_version in range(16, 3, -1):
+                compat_version = 10, minor_version
+                binary_format = "universal2"
+                yield "macosx_{major}_{minor}_{binary_format}".format(
+                    major=compat_version[0],
+                    minor=compat_version[1],
+                    binary_format=binary_format,
+                )
+def _linux_platforms(is_32bit: bool = _32_BIT_INTERPRETER) -> Iterator[str]:
+    linux = _normalize_string(sysconfig.get_platform())
+    if not linux.startswith("linux_"):
+        # we should never be here, just yield the sysconfig one and return
+        yield linux
+        return
+    if is_32bit:
+        if linux == "linux_x86_64":
+            linux = "linux_i686"
+        elif linux == "linux_aarch64":
+            linux = "linux_armv8l"
+    _, arch = linux.split("_", 1)
+    archs = {"armv8l": ["armv8l", "armv7l"]}.get(arch, [arch])
+    yield from _manylinux.platform_tags(archs)
+    yield from _musllinux.platform_tags(archs)
+    for arch in archs:
+        yield f"linux_{arch}"
+def _generic_platforms() -> Iterator[str]:
+    yield _normalize_string(sysconfig.get_platform())
+def platform_tags() -> Iterator[str]:
+    """
+    Provides the platform tags for this installation.
+    """
+    if platform.system() == "Darwin":
+        return mac_platforms()
+    elif platform.system() == "Linux":
+        return _linux_platforms()
+    else:
+        return _generic_platforms()
+def interpreter_name() -> str:
+    """
+    Returns the name of the running interpreter.
+    Some implementations have a reserved, two-letter abbreviation which will
+    be returned when appropriate.
+    """
+    name = sys.implementation.name
+    return INTERPRETER_SHORT_NAMES.get(name) or name
+def interpreter_version(*, warn: bool = False) -> str:
+    """
+    Returns the version of the running interpreter.
+    """
+    version = _get_config_var("py_version_nodot", warn=warn)
+    if version:
+        version = str(version)
+    else:
+        version = _version_nodot(sys.version_info[:2])
+    return version
+def _version_nodot(version: PythonVersion) -> str:
+    return "".join(map(str, version))
+def sys_tags(*, warn: bool = False) -> Iterator[Tag]:
+    """
+    Returns the sequence of tag triples for the running interpreter.
+    The order of the sequence corresponds to priority order for the
+    interpreter, from most to least important.
+    """
+    interp_name = interpreter_name()
+    if interp_name == "cp":
+        yield from cpython_tags(warn=warn)
+    else:
+        yield from generic_tags()
+    if interp_name == "pp":
+        interp = "pp3"
+    elif interp_name == "cp":
+        interp = "cp" + interpreter_version(warn=warn)
+    else:
+        interp = None
+    yield from compatible_tags(interpreter=interp)

Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/utils.py ADDED Viewed

	@@ -0,0 +1,172 @@

+# This file is dual licensed under the terms of the Apache License, Version
+# 2.0, and the BSD License. See the LICENSE file in the root of this repository
+# for complete details.
+import re
+from typing import FrozenSet, NewType, Tuple, Union, cast
+from .tags import Tag, parse_tag
+from .version import InvalidVersion, Version
+# A wheel build tag: the empty tuple when the filename has none, otherwise an
+# ``(int, str)`` pair as produced by parse_wheel_filename().
+BuildTag = Union[Tuple[()], Tuple[int, str]]
+# Distinct ``str`` type for names returned by canonicalize_name().
+NormalizedName = NewType("NormalizedName", str)
+class InvalidName(ValueError):
+    """
+    An invalid distribution name; users should refer to the packaging user guide.
+    Raised by :func:`canonicalize_name` when validation is requested and the
+    name does not match the expected pattern.
+    """
+class InvalidWheelFilename(ValueError):
+    """
+    An invalid wheel filename was found; users should refer to PEP 427.
+    Raised by :func:`parse_wheel_filename`.
+    """
+class InvalidSdistFilename(ValueError):
+    """
+    An invalid sdist filename was found; users should refer to the packaging user guide.
+    Raised by :func:`parse_sdist_filename`.
+    """
+# Core metadata spec for `Name`
+_validate_regex = re.compile(
+    r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$", re.IGNORECASE
+)
+_canonicalize_regex = re.compile(r"[-_.]+")
+_normalized_regex = re.compile(r"^([a-z0-9]|[a-z0-9]([a-z0-9-](?!--))*[a-z0-9])$")
+# PEP 427: The build number must start with a digit.
+_build_tag_regex = re.compile(r"(\d+)(.*)")
+def canonicalize_name(name: str, *, validate: bool = False) -> NormalizedName:
+    if validate and not _validate_regex.match(name):
+        raise InvalidName(f"name is invalid: {name!r}")
+    # This is taken from PEP 503.
+    value = _canonicalize_regex.sub("-", name).lower()
+    return cast(NormalizedName, value)
+def is_normalized_name(name: str) -> bool:
+    return _normalized_regex.match(name) is not None
+def canonicalize_version(
+    version: Union[Version, str], *, strip_trailing_zero: bool = True
+) -> str:
+    """
+    This is very similar to Version.__str__, but has one subtle difference
+    with the way it handles the release segment.
+    """
+    if isinstance(version, str):
+        try:
+            parsed = Version(version)
+        except InvalidVersion:
+            # Legacy versions cannot be normalized
+            return version
+    else:
+        parsed = version
+    parts = []
+    # Epoch
+    if parsed.epoch != 0:
+        parts.append(f"{parsed.epoch}!")
+    # Release segment
+    release_segment = ".".join(str(x) for x in parsed.release)
+    if strip_trailing_zero:
+        # NB: This strips trailing '.0's to normalize
+        release_segment = re.sub(r"(\.0)+$", "", release_segment)
+    parts.append(release_segment)
+    # Pre-release
+    if parsed.pre is not None:
+        parts.append("".join(str(x) for x in parsed.pre))
+    # Post-release
+    if parsed.post is not None:
+        parts.append(f".post{parsed.post}")
+    # Development release
+    if parsed.dev is not None:
+        parts.append(f".dev{parsed.dev}")
+    # Local version segment
+    if parsed.local is not None:
+        parts.append(f"+{parsed.local}")
+    return "".join(parts)
+def parse_wheel_filename(
+    filename: str,
+) -> Tuple[NormalizedName, Version, BuildTag, FrozenSet[Tag]]:
+    if not filename.endswith(".whl"):
+        raise InvalidWheelFilename(
+            f"Invalid wheel filename (extension must be '.whl'): {filename}"
+        )
+    filename = filename[:-4]
+    dashes = filename.count("-")
+    if dashes not in (4, 5):
+        raise InvalidWheelFilename(
+            f"Invalid wheel filename (wrong number of parts): {filename}"
+        )
+    parts = filename.split("-", dashes - 2)
+    name_part = parts[0]
+    # See PEP 427 for the rules on escaping the project name.
+    if "__" in name_part or re.match(r"^[\w\d._]*$", name_part, re.UNICODE) is None:
+        raise InvalidWheelFilename(f"Invalid project name: {filename}")
+    name = canonicalize_name(name_part)
+    try:
+        version = Version(parts[1])
+    except InvalidVersion as e:
+        raise InvalidWheelFilename(
+            f"Invalid wheel filename (invalid version): {filename}"
+        ) from e
+    if dashes == 5:
+        build_part = parts[2]
+        build_match = _build_tag_regex.match(build_part)
+        if build_match is None:
+            raise InvalidWheelFilename(
+                f"Invalid build number: {build_part} in '{filename}'"
+            )
+        build = cast(BuildTag, (int(build_match.group(1)), build_match.group(2)))
+    else:
+        build = ()
+    tags = parse_tag(parts[-1])
+    return (name, version, build, tags)
+def parse_sdist_filename(filename: str) -> Tuple[NormalizedName, Version]:
+    if filename.endswith(".tar.gz"):
+        file_stem = filename[: -len(".tar.gz")]
+    elif filename.endswith(".zip"):
+        file_stem = filename[: -len(".zip")]
+    else:
+        raise InvalidSdistFilename(
+            f"Invalid sdist filename (extension must be '.tar.gz' or '.zip'):"
+            f" {filename}"
+        )
+    # We are requiring a PEP 440 version, which cannot contain dashes,
+    # so we split on the last dash.
+    name_part, sep, version_part = file_stem.rpartition("-")
+    if not sep:
+        raise InvalidSdistFilename(f"Invalid sdist filename: {filename}")
+    name = canonicalize_name(name_part)
+    try:
+        version = Version(version_part)
+    except InvalidVersion as e:
+        raise InvalidSdistFilename(
+            f"Invalid sdist filename (invalid version): {filename}"
+        ) from e
+    return (name, version)