BryanW committed on
Commit
76cbda0
·
verified ·
1 Parent(s): 6954e2b

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/common_functions.h +65 -0
  2. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.h +0 -0
  3. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.hpp +0 -0
  4. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_egl_interop.h +645 -0
  5. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp16.h +0 -0
  6. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.h +475 -0
  7. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_occupancy.h +2094 -0
  8. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline_primitives.h +148 -0
  9. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti.h +123 -0
  10. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_events.h +1349 -0
  11. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling.h +936 -0
  12. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_runtime_cbid.h +504 -0
  13. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_functions.h +65 -0
  14. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/library_types.h +111 -0
  15. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_cuda_host.h +179 -0
  16. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.hpp +588 -0
  17. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.h +330 -0
  18. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.hpp +161 -0
  19. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/__init__.cpython-312.pyc +0 -0
  20. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/convert.cpython-312.pyc +0 -0
  21. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/pack.cpython-312.pyc +0 -0
  22. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/tags.cpython-312.pyc +0 -0
  23. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/unpack.cpython-312.pyc +0 -0
  24. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/__pycache__/__init__.cpython-312.pyc +0 -0
  25. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE +3 -0
  26. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE.APACHE +177 -0
  27. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE.BSD +23 -0
  28. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__init__.py +0 -0
  29. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/__init__.cpython-312.pyc +0 -0
  30. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_elffile.cpython-312.pyc +0 -0
  31. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_manylinux.cpython-312.pyc +0 -0
  32. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_musllinux.cpython-312.pyc +0 -0
  33. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_parser.cpython-312.pyc +0 -0
  34. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_structures.cpython-312.pyc +0 -0
  35. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_tokenizer.cpython-312.pyc +0 -0
  36. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/markers.cpython-312.pyc +0 -0
  37. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/requirements.cpython-312.pyc +0 -0
  38. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/specifiers.cpython-312.pyc +0 -0
  39. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/tags.cpython-312.pyc +0 -0
  40. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/utils.cpython-312.pyc +0 -0
  41. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/version.cpython-312.pyc +0 -0
  42. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_elffile.py +108 -0
  43. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_musllinux.py +83 -0
  44. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_parser.py +356 -0
  45. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_structures.py +61 -0
  46. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_tokenizer.py +192 -0
  47. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/markers.py +253 -0
  48. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/specifiers.py +1011 -0
  49. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/tags.py +571 -0
  50. Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/utils.py +172 -0
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/common_functions.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "common_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/common_functions.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_COMMON_FUNCTIONS_H_WRAPPER__
65
+ #endif
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.h ADDED
The diff for this file is too large to render. See raw diff
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_bf16.hpp ADDED
The diff for this file is too large to render. See raw diff
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_egl_interop.h ADDED
@@ -0,0 +1,645 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_EGL_INTEROP_H__)
51
+ #define __CUDA_EGL_INTEROP_H__
52
+
53
+ #include "cuda_runtime_api.h"
54
+ #include "cuda_runtime.h"
55
+ #include "cudart_platform.h"
56
+ #include "EGL/egl.h"
57
+ #include "EGL/eglext.h"
58
+
59
+ #if defined(__cplusplus)
60
+ extern "C" {
61
+ #endif /* __cplusplus */
62
+
63
+ /**
64
+ * \addtogroup CUDART_TYPES
65
+ * @{
66
+ */
67
+
68
+ /**
69
+ * Maximum number of planes per frame
70
+ */
71
+ #define CUDA_EGL_MAX_PLANES 3
72
+
73
+ /**
74
+ * CUDA EglFrame type - array or pointer
75
+ */
76
+ typedef enum cudaEglFrameType_enum
77
+ {
78
+ cudaEglFrameTypeArray = 0, /**< Frame type CUDA array */
79
+ cudaEglFrameTypePitch = 1, /**< Frame type CUDA pointer */
80
+ } cudaEglFrameType;
81
+
82
+ /**
83
+ * Resource location flags - sysmem or vidmem
84
+ *
85
+ * For CUDA context on iGPU, since video and system memory are equivalent -
86
+ * these flags will not have an effect on the execution.
87
+ *
88
+ * For CUDA context on dGPU, applications can use the flag ::cudaEglResourceLocationFlags
89
+ * to give a hint about the desired location.
90
+ *
91
+ * ::cudaEglResourceLocationSysmem - the frame data is made resident on the system memory
92
+ * to be accessed by CUDA.
93
+ *
94
+ * ::cudaEglResourceLocationVidmem - the frame data is made resident on the dedicated
95
+ * video memory to be accessed by CUDA.
96
+ *
97
+ * There may be an additional latency due to new allocation and data migration,
98
+ * if the frame is produced on a different memory.
99
+ */
100
+ typedef enum cudaEglResourceLocationFlags_enum {
101
+ cudaEglResourceLocationSysmem = 0x00, /**< Resource location sysmem */
102
+ cudaEglResourceLocationVidmem = 0x01, /**< Resource location vidmem */
103
+ } cudaEglResourceLocationFlags;
104
+
105
+ /**
106
+ * CUDA EGL Color Format - The different planar and multiplanar formats currently supported for CUDA_EGL interops.
107
+ */
108
+ typedef enum cudaEglColorFormat_enum {
109
+ cudaEglColorFormatYUV420Planar = 0, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
110
+ cudaEglColorFormatYUV420SemiPlanar = 1, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV420Planar. */
111
+ cudaEglColorFormatYUV422Planar = 2, /**< Y, U, V each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
112
+ cudaEglColorFormatYUV422SemiPlanar = 3, /**< Y, UV in two surfaces with VU byte ordering, width, height ratio same as YUV422Planar. */
113
+ cudaEglColorFormatARGB = 6, /**< R/G/B/A four channels in one surface with BGRA byte ordering. */
114
+ cudaEglColorFormatRGBA = 7, /**< R/G/B/A four channels in one surface with ABGR byte ordering. */
115
+ cudaEglColorFormatL = 8, /**< single luminance channel in one surface. */
116
+ cudaEglColorFormatR = 9, /**< single color channel in one surface. */
117
+ cudaEglColorFormatYUV444Planar = 10, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
118
+ cudaEglColorFormatYUV444SemiPlanar = 11, /**< Y, UV in two surfaces (UV as one surface) with VU byte ordering, width, height ratio same as YUV444Planar. */
119
+ cudaEglColorFormatYUYV422 = 12, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
120
+ cudaEglColorFormatUYVY422 = 13, /**< Y, U, V in one surface, interleaved as YUYV in one channel. */
121
+ cudaEglColorFormatABGR = 14, /**< R/G/B/A four channels in one surface with RGBA byte ordering. */
122
+ cudaEglColorFormatBGRA = 15, /**< R/G/B/A four channels in one surface with ARGB byte ordering. */
123
+ cudaEglColorFormatA = 16, /**< Alpha color format - one channel in one surface. */
124
+ cudaEglColorFormatRG = 17, /**< R/G color format - two channels in one surface with GR byte ordering */
125
+ cudaEglColorFormatAYUV = 18, /**< Y, U, V, A four channels in one surface, interleaved as VUYA. */
126
+ cudaEglColorFormatYVU444SemiPlanar = 19, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
127
+ cudaEglColorFormatYVU422SemiPlanar = 20, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
128
+ cudaEglColorFormatYVU420SemiPlanar = 21, /**< Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
129
+ cudaEglColorFormatY10V10U10_444SemiPlanar = 22, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
130
+ cudaEglColorFormatY10V10U10_420SemiPlanar = 23, /**< Y10, V10U10 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
131
+ cudaEglColorFormatY12V12U12_444SemiPlanar = 24, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
132
+ cudaEglColorFormatY12V12U12_420SemiPlanar = 25, /**< Y12, V12U12 in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
133
+ cudaEglColorFormatVYUY_ER = 26, /**< Extended Range Y, U, V in one surface, interleaved as YVYU in one channel. */
134
+ cudaEglColorFormatUYVY_ER = 27, /**< Extended Range Y, U, V in one surface, interleaved as YUYV in one channel. */
135
+ cudaEglColorFormatYUYV_ER = 28, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
136
+ cudaEglColorFormatYVYU_ER = 29, /**< Extended Range Y, U, V in one surface, interleaved as VYUY in one channel. */
137
+ cudaEglColorFormatYUVA_ER = 31, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as AVUY. */
138
+ cudaEglColorFormatAYUV_ER = 32, /**< Extended Range Y, U, V, A four channels in one surface, interleaved as VUYA. */
139
+ cudaEglColorFormatYUV444Planar_ER = 33, /**< Extended Range Y, U, V in three surfaces, U/V width = Y width, U/V height = Y height. */
140
+ cudaEglColorFormatYUV422Planar_ER = 34, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
141
+ cudaEglColorFormatYUV420Planar_ER = 35, /**< Extended Range Y, U, V in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
142
+ cudaEglColorFormatYUV444SemiPlanar_ER = 36, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = Y width, U/V height = Y height. */
143
+ cudaEglColorFormatYUV422SemiPlanar_ER = 37, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
144
+ cudaEglColorFormatYUV420SemiPlanar_ER = 38, /**< Extended Range Y, UV in two surfaces (UV as one surface) with VU byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
145
+ cudaEglColorFormatYVU444Planar_ER = 39, /**< Extended Range Y, V, U in three surfaces, U/V width = Y width, U/V height = Y height. */
146
+ cudaEglColorFormatYVU422Planar_ER = 40, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = Y height. */
147
+ cudaEglColorFormatYVU420Planar_ER = 41, /**< Extended Range Y, V, U in three surfaces, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
148
+ cudaEglColorFormatYVU444SemiPlanar_ER = 42, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = Y width, U/V height = Y height. */
149
+ cudaEglColorFormatYVU422SemiPlanar_ER = 43, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = Y height. */
150
+ cudaEglColorFormatYVU420SemiPlanar_ER = 44, /**< Extended Range Y, VU in two surfaces (VU as one surface) with UV byte ordering, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
151
+ cudaEglColorFormatBayerRGGB = 45, /**< Bayer format - one channel in one surface with interleaved RGGB ordering. */
152
+ cudaEglColorFormatBayerBGGR = 46, /**< Bayer format - one channel in one surface with interleaved BGGR ordering. */
153
+ cudaEglColorFormatBayerGRBG = 47, /**< Bayer format - one channel in one surface with interleaved GRBG ordering. */
154
+ cudaEglColorFormatBayerGBRG = 48, /**< Bayer format - one channel in one surface with interleaved GBRG ordering. */
155
+ cudaEglColorFormatBayer10RGGB = 49, /**< Bayer10 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
156
+ cudaEglColorFormatBayer10BGGR = 50, /**< Bayer10 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
157
+ cudaEglColorFormatBayer10GRBG = 51, /**< Bayer10 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
158
+ cudaEglColorFormatBayer10GBRG = 52, /**< Bayer10 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
159
+ cudaEglColorFormatBayer12RGGB = 53, /**< Bayer12 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
160
+ cudaEglColorFormatBayer12BGGR = 54, /**< Bayer12 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
161
+ cudaEglColorFormatBayer12GRBG = 55, /**< Bayer12 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
162
+ cudaEglColorFormatBayer12GBRG = 56, /**< Bayer12 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
163
+ cudaEglColorFormatBayer14RGGB = 57, /**< Bayer14 format - one channel in one surface with interleaved RGGB ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
164
+ cudaEglColorFormatBayer14BGGR = 58, /**< Bayer14 format - one channel in one surface with interleaved BGGR ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
165
+ cudaEglColorFormatBayer14GRBG = 59, /**< Bayer14 format - one channel in one surface with interleaved GRBG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
166
+ cudaEglColorFormatBayer14GBRG = 60, /**< Bayer14 format - one channel in one surface with interleaved GBRG ordering. Out of 16 bits, 14 bits used 2 bits No-op. */
167
+ cudaEglColorFormatBayer20RGGB = 61, /**< Bayer20 format - one channel in one surface with interleaved RGGB ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
168
+ cudaEglColorFormatBayer20BGGR = 62, /**< Bayer20 format - one channel in one surface with interleaved BGGR ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
169
+ cudaEglColorFormatBayer20GRBG = 63, /**< Bayer20 format - one channel in one surface with interleaved GRBG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
170
+ cudaEglColorFormatBayer20GBRG = 64, /**< Bayer20 format - one channel in one surface with interleaved GBRG ordering. Out of 32 bits, 20 bits used 12 bits No-op. */
171
+ cudaEglColorFormatYVU444Planar = 65, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = Y width, U/V height = Y height. */
172
+ cudaEglColorFormatYVU422Planar = 66, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = Y height. */
173
+ cudaEglColorFormatYVU420Planar = 67, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
174
+ cudaEglColorFormatBayerIspRGGB = 68, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved RGGB ordering and mapped to opaque integer datatype. */
175
+ cudaEglColorFormatBayerIspBGGR = 69, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved BGGR ordering and mapped to opaque integer datatype. */
176
+ cudaEglColorFormatBayerIspGRBG = 70, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GRBG ordering and mapped to opaque integer datatype. */
177
+ cudaEglColorFormatBayerIspGBRG = 71, /**< Nvidia proprietary Bayer ISP format - one channel in one surface with interleaved GBRG ordering and mapped to opaque integer datatype. */
178
+ cudaEglColorFormatBayerBCCR = 72, /**< Bayer format - one channel in one surface with interleaved BCCR ordering. */
179
+ cudaEglColorFormatBayerRCCB = 73, /**< Bayer format - one channel in one surface with interleaved RCCB ordering. */
180
+ cudaEglColorFormatBayerCRBC = 74, /**< Bayer format - one channel in one surface with interleaved CRBC ordering. */
181
+ cudaEglColorFormatBayerCBRC = 75, /**< Bayer format - one channel in one surface with interleaved CBRC ordering. */
182
+ cudaEglColorFormatBayer10CCCC = 76, /**< Bayer10 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 10 bits used 6 bits No-op. */
183
+ cudaEglColorFormatBayer12BCCR = 77, /**< Bayer12 format - one channel in one surface with interleaved BCCR ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
184
+ cudaEglColorFormatBayer12RCCB = 78, /**< Bayer12 format - one channel in one surface with interleaved RCCB ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
185
+ cudaEglColorFormatBayer12CRBC = 79, /**< Bayer12 format - one channel in one surface with interleaved CRBC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
186
+ cudaEglColorFormatBayer12CBRC = 80, /**< Bayer12 format - one channel in one surface with interleaved CBRC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
187
+ cudaEglColorFormatBayer12CCCC = 81, /**< Bayer12 format - one channel in one surface with interleaved CCCC ordering. Out of 16 bits, 12 bits used 4 bits No-op. */
188
+ cudaEglColorFormatY = 82, /**< Color format for single Y plane. */
189
+ cudaEglColorFormatYUV420SemiPlanar_2020 = 83, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
190
+ cudaEglColorFormatYVU420SemiPlanar_2020 = 84, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
191
+ cudaEglColorFormatYUV420Planar_2020 = 85, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
192
+ cudaEglColorFormatYVU420Planar_2020 = 86, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
193
+ cudaEglColorFormatYUV420SemiPlanar_709 = 87, /**< Y, UV in two surfaces (UV as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
194
+ cudaEglColorFormatYVU420SemiPlanar_709 = 88, /**< Y, VU in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
195
+ cudaEglColorFormatYUV420Planar_709 = 89, /**< Y, U, V in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
196
+ cudaEglColorFormatYVU420Planar_709 = 90, /**< Y, V, U in three surfaces, each in a separate surface, U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
197
+ cudaEglColorFormatY10V10U10_420SemiPlanar_709 = 91, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
198
+ cudaEglColorFormatY10V10U10_420SemiPlanar_2020 = 92, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
199
+ cudaEglColorFormatY10V10U10_422SemiPlanar_2020 = 93, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
200
+ cudaEglColorFormatY10V10U10_422SemiPlanar = 94, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
201
+ cudaEglColorFormatY10V10U10_422SemiPlanar_709 = 95, /**< Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = Y height. */
202
+ cudaEglColorFormatY_ER = 96, /**< Extended Range Color format for single Y plane. */
203
+ cudaEglColorFormatY_709_ER = 97, /**< Extended Range Color format for single Y plane. */
204
+ cudaEglColorFormatY10_ER = 98, /**< Extended Range Color format for single Y10 plane. */
205
+ cudaEglColorFormatY10_709_ER = 99, /**< Extended Range Color format for single Y10 plane. */
206
+ cudaEglColorFormatY12_ER = 100, /**< Extended Range Color format for single Y12 plane. */
207
+ cudaEglColorFormatY12_709_ER = 101, /**< Extended Range Color format for single Y12 plane. */
208
+ cudaEglColorFormatYUVA = 102, /**< Y, U, V, A four channels in one surface, interleaved as AVUY. */
209
+ cudaEglColorFormatYVYU = 104, /**< Y, U, V in one surface, interleaved as YVYU in one channel. */
210
+ cudaEglColorFormatVYUY = 105, /**< Y, U, V in one surface, interleaved as VYUY in one channel. */
211
+ cudaEglColorFormatY10V10U10_420SemiPlanar_ER = 106, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
212
+ cudaEglColorFormatY10V10U10_420SemiPlanar_709_ER = 107, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
213
+ cudaEglColorFormatY10V10U10_444SemiPlanar_ER = 108, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
214
+ cudaEglColorFormatY10V10U10_444SemiPlanar_709_ER = 109, /**< Extended Range Y10, V10U10 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
215
+ cudaEglColorFormatY12V12U12_420SemiPlanar_ER = 110, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
216
+ cudaEglColorFormatY12V12U12_420SemiPlanar_709_ER = 111, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = 1/2 Y width, U/V height = 1/2 Y height. */
217
+ cudaEglColorFormatY12V12U12_444SemiPlanar_ER = 112, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
218
+ cudaEglColorFormatY12V12U12_444SemiPlanar_709_ER = 113, /**< Extended Range Y12, V12U12 in two surfaces (VU as one surface) U/V width = Y width, U/V height = Y height. */
219
+ cudaEglColorFormatUYVY709 = 114, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
220
+ cudaEglColorFormatUYVY709_ER = 115, /**< Extended Range Y, U, V in one surface, interleaved as UYVY in one channel. */
221
+ cudaEglColorFormatUYVY2020 = 116, /**< Y, U, V in one surface, interleaved as UYVY in one channel. */
222
+ } cudaEglColorFormat;
223
+
224
+ /**
225
+ * CUDA EGL Plane Descriptor - structure defining each plane of a CUDA EGLFrame
226
+ */
227
+ typedef struct cudaEglPlaneDesc_st {
228
+ unsigned int width; /**< Width of plane */
229
+ unsigned int height; /**< Height of plane */
230
+ unsigned int depth; /**< Depth of plane */
231
+ unsigned int pitch; /**< Pitch of plane */
232
+ unsigned int numChannels; /**< Number of channels for the plane */
233
+ struct cudaChannelFormatDesc channelDesc; /**< Channel Format Descriptor */
234
+ unsigned int reserved[4]; /**< Reserved for future use */
235
+ } cudaEglPlaneDesc;
236
+
237
+ /**
238
+ * CUDA EGLFrame Descriptor - structure defining one frame of EGL.
239
+ *
240
+ * Each frame may contain one or more planes depending on whether the surface is Multiplanar or not.
241
+ * Each plane of EGLFrame is represented by ::cudaEglPlaneDesc which is defined as:
242
+ * \code
243
+ * typedef struct cudaEglPlaneDesc_st {
244
+ * unsigned int width;
245
+ * unsigned int height;
246
+ * unsigned int depth;
247
+ * unsigned int pitch;
248
+ * unsigned int numChannels;
249
+ * struct cudaChannelFormatDesc channelDesc;
250
+ * unsigned int reserved[4];
251
+ * } cudaEglPlaneDesc;
252
+ * \endcode
253
+
254
+ */
255
+ typedef struct cudaEglFrame_st {
256
+ union {
257
+ cudaArray_t pArray[CUDA_EGL_MAX_PLANES]; /**< Array of CUDA arrays corresponding to each plane*/
258
+ struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES]; /**< Array of Pointers corresponding to each plane*/
259
+ } frame;
260
+ cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES]; /**< CUDA EGL Plane Descriptor ::cudaEglPlaneDesc*/
261
+ unsigned int planeCount; /**< Number of planes */
262
+ cudaEglFrameType frameType; /**< Array or Pitch */
263
+ cudaEglColorFormat eglColorFormat; /**< CUDA EGL Color Format*/
264
+ } cudaEglFrame;
265
+
266
+ /**
267
+ * CUDA EGLStream Connection
268
+ */
269
+ typedef struct CUeglStreamConnection_st *cudaEglStreamConnection;
270
+
271
+ /** @} */ /* END CUDART_TYPES */
272
+
273
+ /**
274
+ * \addtogroup CUDART_EGL EGL Interoperability
275
+ * This section describes the EGL interoperability functions of the CUDA
276
+ * runtime application programming interface.
277
+ *
278
+ * @{
279
+ */
280
+
281
+ /**
282
+ * \brief Registers an EGL image
283
+ *
284
+ * Registers the EGLImageKHR specified by \p image for access by
285
+ * CUDA. A handle to the registered object is returned as \p pCudaResource.
286
+ * Additional Mapping/Unmapping is not required for the registered resource and
287
+ * ::cudaGraphicsResourceGetMappedEglFrame can be directly called on the \p pCudaResource.
288
+ *
289
+ * The application will be responsible for synchronizing access to shared objects.
290
+ * The application must ensure that any pending operations which access the objects have completed
291
+ * before passing control to CUDA. This may be accomplished by issuing and waiting for
292
+ * glFinish command on all GLcontexts (for OpenGL and likewise for other APIs).
293
+ * The application will be also responsible for ensuring that any pending operation on the
294
+ * registered CUDA resource has completed prior to executing subsequent commands in other APIs
295
+ * accessing the same memory objects.
296
+ * This can be accomplished by calling cuCtxSynchronize or cuEventSynchronize (preferably).
297
+ *
298
+ * The surface's intended usage is specified using \p flags, as follows:
299
+ *
300
+ * - ::cudaGraphicsRegisterFlagsNone: Specifies no hints about how this
301
+ * resource will be used. It is therefore assumed that this resource will be
302
+ * read from and written to by CUDA. This is the default value.
303
+ * - ::cudaGraphicsRegisterFlagsReadOnly: Specifies that CUDA
304
+ * will not write to this resource.
305
+ * - ::cudaGraphicsRegisterFlagsWriteDiscard: Specifies that
306
+ * CUDA will not read from this resource and will write over the
307
+ * entire contents of the resource, so none of the data previously
308
+ * stored in the resource will be preserved.
309
+ *
310
+ * The EGLImageKHR is an object which can be used to create EGLImage target resource. It is defined as a void pointer.
311
+ * typedef void* EGLImageKHR
312
+ *
313
+ * \param pCudaResource - Pointer to the returned object handle
314
+ * \param image - An EGLImageKHR image which can be used to create target resource.
315
+ * \param flags - Map flags
316
+ *
317
+ * \return
318
+ * ::cudaSuccess,
319
+ * ::cudaErrorInvalidResourceHandle,
320
+ * ::cudaErrorInvalidValue,
321
+ * ::cudaErrorUnknown
322
+ *
323
+ * \sa
324
+ * ::cudaGraphicsUnregisterResource,
325
+ * ::cudaGraphicsResourceGetMappedEglFrame,
326
+ * ::cuGraphicsEGLRegisterImage
327
+ */
328
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsEGLRegisterImage(struct cudaGraphicsResource **pCudaResource, EGLImageKHR image, unsigned int flags);
329
+
330
+ /**
331
+ * \brief Connect CUDA to EGLStream as a consumer.
332
+ *
333
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream.
334
+ *
335
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
336
+ * API to another.
337
+ *
338
+ * \param conn - Pointer to the returned connection handle
339
+ * \param eglStream - EGLStreamKHR handle
340
+ *
341
+ * \return
342
+ * ::cudaSuccess,
343
+ * ::cudaErrorInvalidValue,
344
+ * ::cudaErrorUnknown
345
+ *
346
+ * \sa
347
+ * ::cudaEGLStreamConsumerDisconnect,
348
+ * ::cudaEGLStreamConsumerAcquireFrame,
349
+ * ::cudaEGLStreamConsumerReleaseFrame,
350
+ * ::cuEGLStreamConsumerConnect
351
+ */
352
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnect(cudaEglStreamConnection *conn, EGLStreamKHR eglStream);
353
+
354
+ /**
355
+ * \brief Connect CUDA to EGLStream as a consumer with given flags.
356
+ *
357
+ * Connect CUDA as a consumer to EGLStreamKHR specified by \p eglStream with specified \p flags defined by
358
+ * ::cudaEglResourceLocationFlags.
359
+ *
360
+ * The flags specify whether the consumer wants to access frames from system memory or video memory.
361
+ * Default is ::cudaEglResourceLocationVidmem.
362
+ *
363
+ * \param conn - Pointer to the returned connection handle
364
+ * \param eglStream - EGLStreamKHR handle
365
+ * \param flags - Flags denote intended location - system or video.
366
+ *
367
+ * \return
368
+ * ::cudaSuccess,
369
+ * ::cudaErrorInvalidValue,
370
+ * ::cudaErrorUnknown
371
+ *
372
+ * \sa
373
+ * ::cudaEGLStreamConsumerDisconnect,
374
+ * ::cudaEGLStreamConsumerAcquireFrame,
375
+ * ::cudaEGLStreamConsumerReleaseFrame,
376
+ * ::cuEGLStreamConsumerConnectWithFlags
377
+ */
378
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerConnectWithFlags(cudaEglStreamConnection *conn, EGLStreamKHR eglStream, unsigned int flags);
379
+
380
+ /**
381
+ * \brief Disconnect CUDA as a consumer to EGLStream .
382
+ *
383
+ * Disconnect CUDA as a consumer to EGLStreamKHR.
384
+ *
385
+ * \param conn - Connection to disconnect.
386
+ *
387
+ * \return
388
+ * ::cudaSuccess,
389
+ * ::cudaErrorInvalidValue,
390
+ * ::cudaErrorUnknown
391
+ *
392
+ * \sa
393
+ * ::cudaEGLStreamConsumerConnect,
394
+ * ::cudaEGLStreamConsumerAcquireFrame,
395
+ * ::cudaEGLStreamConsumerReleaseFrame,
396
+ * ::cuEGLStreamConsumerDisconnect
397
+ */
398
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerDisconnect(cudaEglStreamConnection *conn);
399
+
400
+ /**
401
+ * \brief Acquire an image frame from the EGLStream with CUDA as a consumer.
402
+ *
403
+ * Acquire an image frame from EGLStreamKHR.
404
+ * ::cudaGraphicsResourceGetMappedEglFrame can be called on \p pCudaResource to get
405
+ * ::cudaEglFrame.
406
+ *
407
+ * \param conn - Connection on which to acquire
408
+ * \param pCudaResource - CUDA resource on which the EGLStream frame will be mapped for use.
409
+ * \param pStream - CUDA stream for synchronization and any data migrations
410
+ * implied by ::cudaEglResourceLocationFlags.
411
+ * \param timeout - Desired timeout in usec.
412
+ *
413
+ * \return
414
+ * ::cudaSuccess,
415
+ * ::cudaErrorInvalidValue,
416
+ * ::cudaErrorUnknown,
417
+ * ::cudaErrorLaunchTimeout
418
+ *
419
+ * \sa
420
+ * ::cudaEGLStreamConsumerConnect,
421
+ * ::cudaEGLStreamConsumerDisconnect,
422
+ * ::cudaEGLStreamConsumerReleaseFrame,
423
+ * ::cuEGLStreamConsumerAcquireFrame
424
+ */
425
+
426
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerAcquireFrame(cudaEglStreamConnection *conn,
427
+ cudaGraphicsResource_t *pCudaResource, cudaStream_t *pStream, unsigned int timeout);
428
+ /**
429
+ * \brief Releases the last frame acquired from the EGLStream.
430
+ *
431
+ * Release the acquired image frame specified by \p pCudaResource to EGLStreamKHR.
432
+ *
433
+ * \param conn - Connection on which to release
434
+ * \param pCudaResource - CUDA resource whose corresponding frame is to be released
435
+ * \param pStream - CUDA stream on which release will be done.
436
+ *
437
+ * \return
438
+ * ::cudaSuccess,
439
+ * ::cudaErrorInvalidValue,
440
+ * ::cudaErrorUnknown
441
+ *
442
+ * \sa
443
+ * ::cudaEGLStreamConsumerConnect,
444
+ * ::cudaEGLStreamConsumerDisconnect,
445
+ * ::cudaEGLStreamConsumerAcquireFrame,
446
+ * ::cuEGLStreamConsumerReleaseFrame
447
+ */
448
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamConsumerReleaseFrame(cudaEglStreamConnection *conn,
449
+ cudaGraphicsResource_t pCudaResource, cudaStream_t *pStream);
450
+
451
+ /**
452
+ * \brief Connect CUDA to EGLStream as a producer.
453
+ *
454
+ * Connect CUDA as a producer to EGLStreamKHR specified by \p eglStream.
455
+ *
456
+ * The EGLStreamKHR is an EGL object that transfers a sequence of image frames from one
457
+ * API to another.
458
+ *
459
+ * \param conn - Pointer to the returned connection handle
460
+ * \param eglStream - EGLStreamKHR handle
461
+ * \param width - width of the image to be submitted to the stream
462
+ * \param height - height of the image to be submitted to the stream
463
+ *
464
+ * \return
465
+ * ::cudaSuccess,
466
+ * ::cudaErrorInvalidValue,
467
+ * ::cudaErrorUnknown
468
+ *
469
+ * \sa
470
+ * ::cudaEGLStreamProducerDisconnect,
471
+ * ::cudaEGLStreamProducerPresentFrame,
472
+ * ::cudaEGLStreamProducerReturnFrame,
473
+ * ::cuEGLStreamProducerConnect
474
+ */
475
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerConnect(cudaEglStreamConnection *conn,
476
+ EGLStreamKHR eglStream, EGLint width, EGLint height);
477
+
478
+ /**
479
+ * \brief Disconnect CUDA as a producer to EGLStream .
480
+ *
481
+ * Disconnect CUDA as a producer to EGLStreamKHR.
482
+ *
483
+ * \param conn - Connection to disconnect.
484
+ *
485
+ * \return
486
+ * ::cudaSuccess,
487
+ * ::cudaErrorInvalidValue,
488
+ * ::cudaErrorUnknown
489
+ *
490
+ * \sa
491
+ * ::cudaEGLStreamProducerConnect,
492
+ * ::cudaEGLStreamProducerPresentFrame,
493
+ * ::cudaEGLStreamProducerReturnFrame,
494
+ * ::cuEGLStreamProducerDisconnect
495
+ */
496
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerDisconnect(cudaEglStreamConnection *conn);
497
+
498
+ /**
499
+ * \brief Present a CUDA eglFrame to the EGLStream with CUDA as a producer.
500
+ *
501
+ * The ::cudaEglFrame is defined as:
502
+ * \code
503
+ * typedef struct cudaEglFrame_st {
504
+ * union {
505
+ * cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
506
+ * struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
507
+ * } frame;
508
+ * cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
509
+ * unsigned int planeCount;
510
+ * cudaEglFrameType frameType;
511
+ * cudaEglColorFormat eglColorFormat;
512
+ * } cudaEglFrame;
513
+ * \endcode
514
+ *
515
+ * For ::cudaEglFrame of type ::cudaEglFrameTypePitch, the application may present sub-region of a memory
516
+ * allocation. In that case, ::cudaPitchedPtr::ptr will specify the start address of the sub-region in
517
+ * the allocation and ::cudaEglPlaneDesc will specify the dimensions of the sub-region.
518
+ *
519
+ * \param conn - Connection on which to present the CUDA array
520
+ * \param eglframe - CUDA Eglstream Producer Frame handle to be sent to the consumer over EglStream.
521
+ * \param pStream - CUDA stream on which to present the frame.
522
+ *
523
+ * \return
524
+ * ::cudaSuccess,
525
+ * ::cudaErrorInvalidValue,
526
+ * ::cudaErrorUnknown
527
+ *
528
+ * \sa
529
+ * ::cudaEGLStreamProducerConnect,
530
+ * ::cudaEGLStreamProducerDisconnect,
531
+ * ::cudaEGLStreamProducerReturnFrame,
532
+ * ::cuEGLStreamProducerPresentFrame
533
+ */
534
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerPresentFrame(cudaEglStreamConnection *conn,
535
+ cudaEglFrame eglframe, cudaStream_t *pStream);
536
+
537
+ /**
538
+ * \brief Return the CUDA eglFrame to the EGLStream last released by the consumer.
539
+ *
540
+ * This API can potentially return cudaErrorLaunchTimeout if the consumer has not
541
+ * returned a frame to EGL stream. If timeout is returned the application can retry.
542
+ *
543
+ * \param conn - Connection on which to present the CUDA array
544
+ * \param eglframe - CUDA Eglstream Producer Frame handle returned from the consumer over EglStream.
545
+ * \param pStream - CUDA stream on which to return the frame.
546
+ *
547
+ * \return
548
+ * ::cudaSuccess,
549
+ * ::cudaErrorLaunchTimeout,
550
+ * ::cudaErrorInvalidValue,
551
+ * ::cudaErrorUnknown
552
+ *
553
+ * \sa
554
+ * ::cudaEGLStreamProducerConnect,
555
+ * ::cudaEGLStreamProducerDisconnect,
556
+ * ::cudaEGLStreamProducerPresentFrame,
557
+ * ::cuEGLStreamProducerReturnFrame
558
+ */
559
+ extern __host__ cudaError_t CUDARTAPI cudaEGLStreamProducerReturnFrame(cudaEglStreamConnection *conn,
560
+ cudaEglFrame *eglframe, cudaStream_t *pStream);
561
+
562
+ /**
563
+ * \brief Get an eglFrame through which to access a registered EGL graphics resource.
564
+ *
565
+ * Returns in \p *eglFrame an eglFrame pointer through which the registered graphics resource
566
+ * \p resource may be accessed.
567
+ * This API can only be called for EGL graphics resources.
568
+ *
569
+ * The ::cudaEglFrame is defined as
570
+ * \code
571
+ * typedef struct cudaEglFrame_st {
572
+ * union {
573
+ * cudaArray_t pArray[CUDA_EGL_MAX_PLANES];
574
+ * struct cudaPitchedPtr pPitch[CUDA_EGL_MAX_PLANES];
575
+ * } frame;
576
+ * cudaEglPlaneDesc planeDesc[CUDA_EGL_MAX_PLANES];
577
+ * unsigned int planeCount;
578
+ * cudaEglFrameType frameType;
579
+ * cudaEglColorFormat eglColorFormat;
580
+ * } cudaEglFrame;
581
+ * \endcode
582
+ *
583
+ *
584
+ * \param eglFrame - Returned eglFrame.
585
+ * \param resource - Registered resource to access.
586
+ * \param index - Index for cubemap surfaces.
587
+ * \param mipLevel - Mipmap level for the subresource to access.
588
+ *
589
+ * \return
590
+ * ::cudaSuccess,
591
+ * ::cudaErrorInvalidValue,
592
+ * ::cudaErrorUnknown
593
+ *
594
+ * \note Note that in case of multiplanar \p *eglFrame, pitch of only first plane (unsigned int cudaEglPlaneDesc::pitch) is to be considered by the application.
595
+ *
596
+ * \sa
597
+ * ::cudaGraphicsSubResourceGetMappedArray,
598
+ * ::cudaGraphicsResourceGetMappedPointer,
599
+ * ::cuGraphicsResourceGetMappedEglFrame
600
+ */
601
+ extern __host__ cudaError_t CUDARTAPI cudaGraphicsResourceGetMappedEglFrame(cudaEglFrame* eglFrame,
602
+ cudaGraphicsResource_t resource, unsigned int index, unsigned int mipLevel);
603
+
604
+ /**
605
+ * \brief Creates an event from EGLSync object
606
+ *
607
+ * Creates an event *phEvent from an EGLSyncKHR eglSync with the flags specified
608
+ * via \p flags. Valid flags include:
609
+ * - ::cudaEventDefault: Default event creation flag.
610
+ * - ::cudaEventBlockingSync: Specifies that the created event should use blocking
611
+ * synchronization. A CPU thread that uses ::cudaEventSynchronize() to wait on
612
+ * an event created with this flag will block until the event has actually
613
+ * been completed.
614
+ *
615
+ * ::cudaEventRecord and TimingData are not supported for events created from EGLSync.
616
+ *
617
+ * The EGLSyncKHR is an opaque handle to an EGL sync object.
618
+ * typedef void* EGLSyncKHR
619
+ *
620
+ * \param phEvent - Returns newly created event
621
+ * \param eglSync - Opaque handle to EGLSync object
622
+ * \param flags - Event creation flags
623
+ *
624
+ * \return
625
+ * ::cudaSuccess,
626
+ * ::cudaErrorInitializationError,
627
+ * ::cudaErrorInvalidValue,
628
+ * ::cudaErrorLaunchFailure,
629
+ * ::cudaErrorMemoryAllocation
630
+ *
631
+ * \sa
632
+ * ::cudaEventQuery,
633
+ * ::cudaEventSynchronize,
634
+ * ::cudaEventDestroy
635
+ */
636
+ extern __host__ cudaError_t CUDARTAPI cudaEventCreateFromEGLSync(cudaEvent_t *phEvent, EGLSyncKHR eglSync, unsigned int flags);
637
+
638
+ /** @} */ /* END CUDART_EGL */
639
+
640
+ #if defined(__cplusplus)
641
+ }
642
+ #endif /* __cplusplus */
643
+
644
+ #endif /* __CUDA_EGL_INTEROP_H__ */
645
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp16.h ADDED
The diff for this file is too large to render. See raw diff
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_fp8.h ADDED
@@ -0,0 +1,475 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2022-2024 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef __CUDA_FP8_H__
51
+ #define __CUDA_FP8_H__
52
+
53
+ /* Set up function decorations */
54
+ #if defined(__CUDACC__)
55
+ #define __CUDA_FP8_DECL__ static __device__ __inline__
56
+ #define __CUDA_HOSTDEVICE_FP8__ __host__ __device__
57
+ #define __CUDA_HOSTDEVICE_FP8_DECL__ static __host__ __device__ __inline__
58
+ #else /* !defined(__CUDACC__) */
59
+ #if defined(__GNUC__)
60
+ #define __CUDA_HOSTDEVICE_FP8_DECL__ static __attribute__((unused))
61
+ #else
62
+ #define __CUDA_HOSTDEVICE_FP8_DECL__ static
63
+ #endif /* defined(__GNUC__) */
64
+ #define __CUDA_HOSTDEVICE_FP8__
65
+ #endif /* defined(__CUDACC__) */
66
+
67
+ #if !defined(_MSC_VER) && __cplusplus >= 201103L
68
+ #define __CPP_VERSION_AT_LEAST_11_FP8
69
+ #elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
70
+ #define __CPP_VERSION_AT_LEAST_11_FP8
71
+ #endif
72
+
73
+ // implicitly provided by NVRTC
74
+ #if !defined(__CUDACC_RTC__)
75
+ /* bring in enum cudaRoundMode */
76
+ #include "device_types.h"
77
+ #endif /* !defined(__CUDACC_RTC__) */
78
+
79
+ /* bring in __half_raw data type */
80
+ #include "cuda_fp16.h"
81
+ /* bring in __nv_bfloat16_raw data type */
82
+ #include "cuda_bf16.h"
83
+
84
+ // implicitly provided by NVRTC
85
+ #if !defined(__CUDACC_RTC__)
86
+ /* bring in float2, double4, etc vector types */
87
+ #include "vector_types.h"
88
+ #endif /* !defined(__CUDACC_RTC__) */
89
+
90
+ /**
91
+ * \defgroup CUDA_MATH_INTRINSIC_FP8 FP8 Intrinsics
92
+ * This section describes fp8 intrinsic functions.
93
+ * To use these functions, include the header file \p cuda_fp8.h in your
94
+ * program.
95
+ * The following macros are available to help users selectively enable/disable
96
+ * various definitions present in the header file:
97
+ * - \p __CUDA_NO_FP8_CONVERSIONS__ - If defined, this macro will prevent any
98
+ * use of the C++ type conversions (converting constructors and conversion
99
+ * operators) defined in the header.
100
+ * - \p __CUDA_NO_FP8_CONVERSION_OPERATORS__ - If defined, this macro will
101
+ * prevent any use of the C++ conversion operators from \p fp8 to other types.
102
+ */
103
+
104
+ /**
105
+ * \defgroup CUDA_MATH_FP8_MISC FP8 Conversion and Data Movement
106
+ * \ingroup CUDA_MATH_INTRINSIC_FP8
107
+ * To use these functions, include the header file \p cuda_fp8.h in your
108
+ * program.
109
+ */
110
+
111
+ /**
112
+ * \ingroup CUDA_MATH_FP8_MISC
113
+ * \brief 8-bit \p unsigned \p integer
114
+ * type abstraction used for \p fp8 floating-point
115
+ * numbers storage.
116
+ */
117
+ typedef unsigned char __nv_fp8_storage_t;
118
+
119
+ /**
120
+ * \ingroup CUDA_MATH_FP8_MISC
121
+ * \brief 16-bit \p unsigned \p integer
122
+ * type abstraction used for storage of pairs of
123
+ * \p fp8 floating-point numbers.
124
+ */
125
+ typedef unsigned short int __nv_fp8x2_storage_t;
126
+
127
+ /**
128
+ * \ingroup CUDA_MATH_FP8_MISC
129
+ * \brief 32-bit \p unsigned \p integer
130
+ * type abstraction used for storage of tetrads of
131
+ * \p fp8 floating-point numbers.
132
+ */
133
+ typedef unsigned int __nv_fp8x4_storage_t;
134
+
135
+ /**
136
+ * \ingroup CUDA_MATH_FP8_MISC
137
+ * \brief Enumerates the modes applicable when
138
+ * performing a narrowing conversion to \p fp8 destination types.
139
+ */
140
+ typedef enum __nv_saturation_t {
141
+ /**
142
+ * Means no saturation to finite is performed when conversion
143
+ * results in rounding values outside the range of destination
144
+ * type.
145
+ * NOTE: for fp8 type of e4m3 kind, the results that are larger
146
+ * than the maximum representable finite number of the target
147
+ * format become NaN.
148
+ */
149
+ __NV_NOSAT,
150
+ /**
151
+ * Means input larger than the maximum representable
152
+ * finite number MAXNORM of the target format round to the
153
+ * MAXNORM of the same sign as input.
154
+ */
155
+ __NV_SATFINITE,
156
+ } __nv_saturation_t;
157
+
158
+ /**
159
+ * \ingroup CUDA_MATH_FP8_MISC
160
+ * \brief Enumerates the possible
161
+ * interpretations of the 8-bit values when referring to them as
162
+ * \p fp8 types.
163
+ */
164
+ typedef enum __nv_fp8_interpretation_t {
165
+ __NV_E4M3, /**< Stands for \p fp8 numbers of \p e4m3 kind. */
166
+ __NV_E5M2, /**< Stands for \p fp8 numbers of \p e5m2 kind. */
167
+ } __nv_fp8_interpretation_t;
168
+
169
+ /* Forward-declaration of C-style APIs */
170
+
171
+ /**
172
+ * \ingroup CUDA_MATH_FP8_MISC
173
+ * \brief Converts input \p double precision \p x to \p fp8 type of the
174
+ * requested kind using round-to-nearest-even rounding and requested saturation
175
+ * mode.
176
+ *
177
+ * \details Converts input \p x to \p fp8 type of the kind specified by
178
+ * \p fp8_interpretation parameter,
179
+ * using round-to-nearest-even rounding and
180
+ * saturation mode specified by \p saturate parameter.
181
+ *
182
+ * \returns
183
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
184
+ */
185
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
186
+ __nv_cvt_double_to_fp8(const double x, const __nv_saturation_t saturate,
187
+ const __nv_fp8_interpretation_t fp8_interpretation);
188
+
189
+ /**
190
+ * \ingroup CUDA_MATH_FP8_MISC
191
+ * \brief Converts input vector of two \p double precision numbers packed
192
+ * in \p double2 \p x into a vector of two values of \p fp8 type of
193
+ * the requested kind using round-to-nearest-even rounding and requested
194
+ * saturation mode.
195
+ *
196
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
197
+ * kind specified by \p fp8_interpretation parameter, using
198
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
199
+ * parameter.
200
+ *
201
+ * \returns
202
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
203
+ */
204
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
205
+ __nv_cvt_double2_to_fp8x2(const double2 x, const __nv_saturation_t saturate,
206
+ const __nv_fp8_interpretation_t fp8_interpretation);
207
+
208
+ /**
209
+ * \ingroup CUDA_MATH_FP8_MISC
210
+ * \brief Converts input \p single precision \p x to \p fp8 type of the
211
+ * requested kind using round-to-nearest-even rounding and requested saturation
212
+ * mode.
213
+ *
214
+ * \details Converts input \p x to \p fp8 type of the kind specified by
215
+ * \p fp8_interpretation parameter,
216
+ * using round-to-nearest-even rounding and
217
+ * saturation mode specified by \p saturate parameter.
218
+ *
219
+ * \returns
220
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
221
+ */
222
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
223
+ __nv_cvt_float_to_fp8(const float x, const __nv_saturation_t saturate,
224
+ const __nv_fp8_interpretation_t fp8_interpretation);
225
+
226
+ /**
227
+ * \ingroup CUDA_MATH_FP8_MISC
228
+ * \brief Converts input vector of two \p single precision numbers packed
229
+ * in \p float2 \p x into a vector of two values of \p fp8 type of
230
+ * the requested kind using round-to-nearest-even rounding and requested
231
+ * saturation mode.
232
+ *
233
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
234
+ * kind specified by \p fp8_interpretation parameter, using
235
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
236
+ * parameter.
237
+ *
238
+ * \returns
239
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
240
+ */
241
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
242
+ __nv_cvt_float2_to_fp8x2(const float2 x, const __nv_saturation_t saturate,
243
+ const __nv_fp8_interpretation_t fp8_interpretation);
244
+
245
+ /**
246
+ * \ingroup CUDA_MATH_FP8_MISC
247
+ * \brief Converts input \p half precision \p x to \p fp8 type of the requested
248
+ * kind using round-to-nearest-even rounding and requested saturation mode.
249
+ *
250
+ * \details Converts input \p x to \p fp8 type of the kind specified by
251
+ * \p fp8_interpretation parameter,
252
+ * using round-to-nearest-even rounding and
253
+ * saturation mode specified by \p saturate parameter.
254
+ *
255
+ * \returns
256
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
257
+ */
258
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t
259
+ __nv_cvt_halfraw_to_fp8(const __half_raw x, const __nv_saturation_t saturate,
260
+ const __nv_fp8_interpretation_t fp8_interpretation);
261
+
262
+ /**
263
+ * \ingroup CUDA_MATH_FP8_MISC
264
+ * \brief Converts input vector of two \p half precision numbers packed
265
+ * in \p __half2_raw \p x into a vector of two values of \p fp8 type of
266
+ * the requested kind using round-to-nearest-even rounding and requested
267
+ * saturation mode.
268
+ *
269
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
270
+ * kind specified by \p fp8_interpretation parameter, using
271
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
272
+ * parameter.
273
+ *
274
+ * \returns
275
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
276
+ */
277
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_halfraw2_to_fp8x2(
278
+ const __half2_raw x, const __nv_saturation_t saturate,
279
+ const __nv_fp8_interpretation_t fp8_interpretation);
280
+
281
+ /**
282
+ * \ingroup CUDA_MATH_FP8_MISC
283
+ * \brief Converts input \p nv_bfloat16 precision \p x to \p fp8 type of the
284
+ * requested kind using round-to-nearest-even rounding and requested saturation
285
+ * mode.
286
+ *
287
+ * \details Converts input \p x to \p fp8 type of the kind specified by
288
+ * \p fp8_interpretation parameter,
289
+ * using round-to-nearest-even rounding and
290
+ * saturation mode specified by \p saturate parameter.
291
+ *
292
+ * \returns
293
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
294
+ */
295
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_fp8(
296
+ const __nv_bfloat16_raw x, const __nv_saturation_t saturate,
297
+ const __nv_fp8_interpretation_t fp8_interpretation);
298
+
299
+ /**
300
+ * \ingroup CUDA_MATH_FP8_MISC
301
+ * \brief Converts input vector of two \p nv_bfloat16 precision numbers packed
302
+ * in \p __nv_bfloat162_raw \p x into a vector of two values of \p fp8 type of
303
+ * the requested kind using round-to-nearest-even rounding and requested
304
+ * saturation mode.
305
+ *
306
+ * \details Converts input vector \p x to a vector of two \p fp8 values of the
307
+ * kind specified by \p fp8_interpretation parameter, using
308
+ * round-to-nearest-even rounding and saturation mode specified by \p saturate
309
+ * parameter.
310
+ *
311
+ * \returns
312
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
313
+ */
314
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t
315
+ __nv_cvt_bfloat16raw2_to_fp8x2(
316
+ const __nv_bfloat162_raw x, const __nv_saturation_t saturate,
317
+ const __nv_fp8_interpretation_t fp8_interpretation);
318
+
319
+ /**
320
+ * \ingroup CUDA_MATH_FP8_MISC
321
+ * \brief Converts input \p fp8 \p x of the specified kind
322
+ * to \p half precision.
323
+ *
324
+ * \details Converts input \p x of \p fp8 type of the kind specified by
325
+ * \p fp8_interpretation parameter
326
+ * to \p half precision.
327
+ *
328
+ * \returns
329
+ * - The \p __half_raw value holds the result of conversion.
330
+ */
331
+ __CUDA_HOSTDEVICE_FP8_DECL__ __half_raw
332
+ __nv_cvt_fp8_to_halfraw(const __nv_fp8_storage_t x,
333
+ const __nv_fp8_interpretation_t fp8_interpretation);
334
+ /**
335
+ * \ingroup CUDA_MATH_FP8_MISC
336
+ * \brief Converts input vector of two \p fp8 values of the specified kind
337
+ * to a vector of two \p half precision values packed in \p __half2_raw
338
+ * structure.
339
+ *
340
+ * \details Converts input vector \p x of \p fp8 type of the kind specified by
341
+ * \p fp8_interpretation parameter
342
+ * to a vector of two \p half precision values and returns as \p __half2_raw
343
+ * structure.
344
+ *
345
+ * \returns
346
+ * - The \p __half2_raw value holds the result of conversion.
347
+ */
348
+ __CUDA_HOSTDEVICE_FP8_DECL__ __half2_raw
349
+ __nv_cvt_fp8x2_to_halfraw2(const __nv_fp8x2_storage_t x,
350
+ const __nv_fp8_interpretation_t fp8_interpretation);
351
+
352
+
353
+ /**
354
+ * \ingroup CUDA_MATH_FP8_MISC
355
+ * \brief Converts \p bfloat16 input into a scaling factor of \p e8m0 kind.
356
+ *
357
+ * \details Input number's absolute value is rounded to the closest power of two in the
358
+ * direction specified via \p rounding parameter. Rounded results that are
359
+ * smaller than the smallest representable target format number 2^-127 are then
360
+ * clipped to 2^-127. Results that are larger than the largest representable
361
+ * target format number 2^127 are either clipped to 2^127 if \p saturate equals
362
+ * \p __NV_SATFINITE, or converted to \p NaN otherwise. \p NaN inputs convert
363
+ * into \p NaN output, encoded as \p 0xFF in the target format.
364
+ *
365
+ * \returns
366
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
367
+ */
368
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_bfloat16raw_to_e8m0(const __nv_bfloat16_raw x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
369
+
370
+ /**
371
+ * \ingroup CUDA_MATH_FP8_MISC
372
+ * \brief Converts a pair of \p bfloat16 values into a pair of scaling factors of \p e8m0 kind.
373
+ *
374
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
375
+ *
376
+ * \returns
377
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
378
+ */
379
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_bfloat162raw_to_e8m0x2(const __nv_bfloat162_raw x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
380
+
381
+ /**
382
+ * \ingroup CUDA_MATH_FP8_MISC
383
+ * \brief Converts input \p float value into a scaling factor of \p e8m0 kind.
384
+ *
385
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
386
+ *
387
+ * \returns
388
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
389
+ */
390
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_float_to_e8m0(const float x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
391
+
392
+ /**
393
+ * \ingroup CUDA_MATH_FP8_MISC
394
+ * \brief Converts a pair of \p float values into a pair of scaling factors of \p e8m0 kind.
395
+ *
396
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
397
+ *
398
+ * \returns
399
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
400
+ */
401
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_float2_to_e8m0x2(const float2 x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
402
+
403
+ /**
404
+ * \ingroup CUDA_MATH_FP8_MISC
405
+ * \brief Converts input \p double value into a scaling factor of \p e8m0 kind.
406
+ *
407
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
408
+ *
409
+ * \returns
410
+ * - The \p __nv_fp8_storage_t value holds the result of conversion.
411
+ */
412
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8_storage_t __nv_cvt_double_to_e8m0(const double x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
413
+
414
+ /**
415
+ * \ingroup CUDA_MATH_FP8_MISC
416
+ * \brief Converts a pair of \p double values into a pair of scaling factors of \p e8m0 kind.
417
+ *
418
+ * \see __nv_cvt_bfloat16raw_to_e8m0() for details of conversion.
419
+ *
420
+ * \returns
421
+ * - The \p __nv_fp8x2_storage_t value holds the result of conversion.
422
+ */
423
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_fp8x2_storage_t __nv_cvt_double2_to_e8m0x2(const double2 x, const __nv_saturation_t saturate, const enum cudaRoundMode rounding);
424
+
425
+ /**
426
+ * \ingroup CUDA_MATH_FP8_MISC
427
+ * \brief Converts input scaling factor value of \p e8m0 kind into \p bfloat16.
428
+ *
429
+ * \details Input scales are exact powers of two or a \p NaN value,
430
+ * also representable in the target format.
431
+ *
432
+ * \returns
433
+ * - The \p __nv_bfloat16_raw value holds the result of conversion.
434
+ */
435
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat16_raw __nv_cvt_e8m0_to_bf16raw(const __nv_fp8_storage_t x);
436
+
437
+ /**
438
+ * \ingroup CUDA_MATH_FP8_MISC
439
+ * \brief Converts input pair of scaling factors of \p e8m0 kind into a pair of \p bfloat16 values.
440
+ *
441
+ * \returns
442
+ * - The \p __nv_bfloat162_raw value holds the result of conversion.
443
+ */
444
+ __CUDA_HOSTDEVICE_FP8_DECL__ __nv_bfloat162_raw __nv_cvt_e8m0x2_to_bf162raw(const __nv_fp8x2_storage_t x);
445
+
446
+ #if defined(__cplusplus)
447
+
448
+ #define __CUDA_FP8_TYPES_EXIST__
449
+
450
+ /* Forward-declaration of structures defined in "cuda_fp8.hpp" */
451
+ struct __nv_fp8_e5m2;
452
+ struct __nv_fp8x2_e5m2;
453
+ struct __nv_fp8x4_e5m2;
454
+
455
+ struct __nv_fp8_e4m3;
456
+ struct __nv_fp8x2_e4m3;
457
+ struct __nv_fp8x4_e4m3;
458
+
459
+ struct __nv_fp8_e8m0;
460
+ struct __nv_fp8x2_e8m0;
461
+ struct __nv_fp8x4_e8m0;
462
+
463
+ #endif /* defined(__cplusplus) */
464
+
465
+ #include "cuda_fp8.hpp"
466
+
467
+ #undef __CUDA_FP8_DECL__
468
+ #undef __CUDA_HOSTDEVICE_FP8__
469
+ #undef __CUDA_HOSTDEVICE_FP8_DECL__
470
+
471
+ #if defined(__CPP_VERSION_AT_LEAST_11_FP8)
472
+ #undef __CPP_VERSION_AT_LEAST_11_FP8
473
+ #endif /* defined(__CPP_VERSION_AT_LEAST_11_FP8) */
474
+
475
+ #endif /* end of include guard: __CUDA_FP8_H__ */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_occupancy.h ADDED
@@ -0,0 +1,2094 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * CUDA Occupancy Calculator
52
+ *
53
+ * NAME
54
+ *
55
+ * cudaOccMaxActiveBlocksPerMultiprocessor,
56
+ * cudaOccMaxPotentialOccupancyBlockSize,
57
+ * cudaOccMaxPotentialOccupancyBlockSizeVariableSMem,
58
+ * cudaOccAvailableDynamicSMemPerBlock
59
+ *
60
+ * DESCRIPTION
61
+ *
62
+ * The CUDA occupancy calculator provides a standalone, programmatic
63
+ * interface to compute the occupancy of a function on a device. It can also
64
+ * provide occupancy-oriented launch configuration suggestions.
65
+ *
66
+ * The function and device are defined by the user through
67
+ * cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
68
+ * structures. All APIs require all 3 of them.
69
+ *
70
+ * See the structure definition for more details about the device / function
71
+ * descriptors.
72
+ *
73
+ * See each API's prototype for API usage.
74
+ *
75
+ * COMPATIBILITY
76
+ *
77
+ * The occupancy calculator will be updated on each major CUDA toolkit
78
+ * release. It does not provide forward compatibility, i.e. new hardware
79
+ * released after this implementation's release will not be supported.
80
+ *
81
+ * NOTE
82
+ *
83
+ * If there is access to CUDA runtime, and the sole intent is to calculate
84
+ * occupancy related values on one of the accessible CUDA devices, using CUDA
85
+ * runtime's occupancy calculation APIs is recommended.
86
+ *
87
+ */
88
+
89
+ #ifndef __cuda_occupancy_h__
90
+ #define __cuda_occupancy_h__
91
+
92
+ #include <stddef.h>
93
+ #include <limits.h>
94
+ #include <string.h>
95
+
96
+
97
+ // __OCC_INLINE will be undefined at the end of this header
98
+ //
99
+ #ifdef __CUDACC__
100
+ #define __OCC_INLINE inline __host__ __device__
101
+ #elif defined _MSC_VER
102
+ #define __OCC_INLINE __inline
103
+ #else // GNUCC assumed
104
+ #define __OCC_INLINE inline
105
+ #endif
106
+
107
+ enum cudaOccError_enum {
108
+ CUDA_OCC_SUCCESS = 0, // no error encountered
109
+ CUDA_OCC_ERROR_INVALID_INPUT = 1, // input parameter is invalid
110
+ CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2, // requested device is not supported in
111
+ // current implementation or device is
112
+ // invalid
113
+ };
114
+ typedef enum cudaOccError_enum cudaOccError;
115
+
116
+ typedef struct cudaOccResult cudaOccResult;
117
+ typedef struct cudaOccDeviceProp cudaOccDeviceProp;
118
+ typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
119
+ typedef struct cudaOccDeviceState cudaOccDeviceState;
120
+
121
+ /**
122
+ * The CUDA occupancy calculator computes the occupancy of the function
123
+ * described by attributes with the given block size (blockSize), static device
124
+ * properties (properties), dynamic device states (state) and per-block dynamic
124
+ * shared memory allocation (dynamicSmemSize) in bytes, and outputs it through
126
+ * result along with other useful information. The occupancy is computed in
127
+ * terms of the maximum number of active blocks per multiprocessor. The user can
128
+ * then convert it to other metrics, such as number of active warps.
129
+ *
130
+ * RETURN VALUE
131
+ *
132
+ * The occupancy and related information is returned through result.
133
+ *
134
+ * If result->activeBlocksPerMultiprocessor is 0, then the given parameter
135
+ * combination cannot run on the device.
136
+ *
137
+ * ERRORS
138
+ *
139
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
140
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
141
+ * current implementation or device is invalid
142
+ */
143
+ static __OCC_INLINE
144
+ cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
145
+ cudaOccResult *result, // out
146
+ const cudaOccDeviceProp *properties, // in
147
+ const cudaOccFuncAttributes *attributes, // in
148
+ const cudaOccDeviceState *state, // in
149
+ int blockSize, // in
150
+ size_t dynamicSmemSize); // in
151
+
152
+ /**
153
+ * The CUDA launch configurator C API suggests a grid / block size pair (in
154
+ * minGridSize and blockSize) that achieves the best potential occupancy
155
+ * (i.e. maximum number of active warps with the smallest number of blocks) for
156
+ * the given function described by attributes, on a device described by
157
+ * properties with settings in state.
158
+ *
159
+ * If per-block dynamic shared memory allocation is not needed, the user should
160
+ * leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0.
161
+ *
162
+ * If per-block dynamic shared memory allocation is needed, then if the dynamic
163
+ * shared memory size is constant regardless of block size, the size should be
164
+ * passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
165
+ * NULL.
166
+ *
167
+ * Otherwise, if the per-block dynamic shared memory size varies with different
168
+ * block sizes, the user needs to provide a pointer to a unary function through
169
+ * blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
170
+ * a block of the function for any given block size. dynamicSMemSize is
171
+ * ignored. An example signature is:
172
+ *
173
+ * // Take block size, returns dynamic shared memory needed
174
+ * size_t blockToSmem(int blockSize);
175
+ *
176
+ * RETURN VALUE
177
+ *
178
+ * The suggested block size and the minimum number of blocks needed to achieve
179
+ * the maximum occupancy are returned through blockSize and minGridSize.
180
+ *
181
+ * If *blockSize is 0, then the given combination cannot run on the device.
182
+ *
183
+ * ERRORS
184
+ *
185
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
186
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
187
+ * current implementation or device is invalid
188
+ *
189
+ */
190
+ static __OCC_INLINE
191
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
192
+ int *minGridSize, // out
193
+ int *blockSize, // out
194
+ const cudaOccDeviceProp *properties, // in
195
+ const cudaOccFuncAttributes *attributes, // in
196
+ const cudaOccDeviceState *state, // in
197
+ size_t (*blockSizeToDynamicSMemSize)(int), // in
198
+ size_t dynamicSMemSize); // in
199
+
200
+ /**
201
+ * The CUDA launch configurator C++ API suggests a grid / block size pair (in
202
+ * minGridSize and blockSize) that achieves the best potential occupancy
203
+ * (i.e. the maximum number of active warps with the smallest number of blocks)
204
+ * for the given function described by attributes, on a device described by
205
+ * properties with settings in state.
206
+ *
207
+ * If per-block dynamic shared memory allocation is 0 or constant regardless of
208
+ * block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
209
+ * configure the launch. A constant dynamic shared memory allocation size in
210
+ * bytes can be passed through dynamicSMemSize.
211
+ *
212
+ * Otherwise, if the per-block dynamic shared memory size varies with different
213
+ * block sizes, the user needs to use
214
+ * cudaOccMaxPotentialOccupancyBlockSizeVariableSMem instead, and provide a
214
+ * functor / pointer to a unary function (blockSizeToDynamicSMemSize) that
216
+ * computes the dynamic shared memory needed by func for any given block
217
+ * size. An example signature is:
218
+ *
219
+ * // Take block size, returns per-block dynamic shared memory needed
220
+ * size_t blockToSmem(int blockSize);
221
+ *
222
+ * RETURN VALUE
223
+ *
224
+ * The suggested block size and the minimum number of blocks needed to achieve
225
+ * the maximum occupancy are returned through blockSize and minGridSize.
226
+ *
227
+ * If *blockSize is 0, then the given combination cannot run on the device.
228
+ *
229
+ * ERRORS
230
+ *
231
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
232
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
233
+ * current implementation or device is invalid
234
+ *
235
+ */
236
+
237
+ #if defined(__cplusplus)
238
+ namespace {
239
+
240
+ __OCC_INLINE
241
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
242
+ int *minGridSize, // out
243
+ int *blockSize, // out
244
+ const cudaOccDeviceProp *properties, // in
245
+ const cudaOccFuncAttributes *attributes, // in
246
+ const cudaOccDeviceState *state, // in
247
+ size_t dynamicSMemSize = 0); // in
248
+
249
+ template <typename UnaryFunction>
250
+ __OCC_INLINE
251
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
252
+ int *minGridSize, // out
253
+ int *blockSize, // out
254
+ const cudaOccDeviceProp *properties, // in
255
+ const cudaOccFuncAttributes *attributes, // in
256
+ const cudaOccDeviceState *state, // in
257
+ UnaryFunction blockSizeToDynamicSMemSize); // in
258
+
259
+ } // namespace anonymous
260
+ #endif // defined(__cplusplus)
261
+
262
+ /**
263
+ *
264
+ * The CUDA dynamic shared memory calculator computes the maximum size of
265
+ * per-block dynamic shared memory if we want to place numBlocks blocks
266
+ * on an SM.
267
+ *
268
+ * RETURN VALUE
269
+ *
270
+ * Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow
271
+ * numBlocks blocks per SM.
272
+ *
273
+ * ERRORS
274
+ *
275
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
276
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
277
+ * current implementation or device is invalid
278
+ *
279
+ */
280
+ static __OCC_INLINE
281
+ cudaOccError cudaOccAvailableDynamicSMemPerBlock(
282
+ size_t *dynamicSmemSize,
283
+ const cudaOccDeviceProp *properties,
284
+ const cudaOccFuncAttributes *attributes,
285
+ const cudaOccDeviceState *state,
286
+ int numBlocks,
287
+ int blockSize);
288
+
289
+ /**
290
+ * Data structures
291
+ *
292
+ * These structures are subject to change for future architecture and CUDA
293
+ * releases. C users should initialize the structure as {0}.
294
+ *
295
+ */
296
+
297
+ /**
298
+ * Device descriptor
299
+ *
300
+ * This structure describes a device.
301
+ */
302
+ struct cudaOccDeviceProp {
303
+ int computeMajor; // Compute capability major version
304
+ int computeMinor; // Compute capability minor
305
+ // version. Unsupported minor versions
306
+ // may cause an error
307
+ int maxThreadsPerBlock; // Maximum number of threads per block
308
+ int maxThreadsPerMultiprocessor; // Maximum number of threads per SM
309
+ // i.e. (Max. number of warps) x (warp
310
+ // size)
311
+ int regsPerBlock; // Maximum number of registers per block
312
+ int regsPerMultiprocessor; // Maximum number of registers per SM
313
+ int warpSize; // Warp size
314
+ size_t sharedMemPerBlock; // Maximum shared memory size per block
315
+ size_t sharedMemPerMultiprocessor; // Maximum shared memory size per SM
316
+ int numSms; // Number of SMs available
317
+ size_t sharedMemPerBlockOptin; // Maximum optin shared memory size per block
318
+ size_t reservedSharedMemPerBlock; // Shared memory per block reserved by driver
319
+
320
+ #ifdef __cplusplus
321
+ // This structure can be converted from a cudaDeviceProp structure for users
322
+ // that use this header in their CUDA applications.
323
+ //
324
+ // If the application has access to the CUDA Runtime API, the application
325
+ // can obtain the device properties of a CUDA device through
326
+ // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
327
+ // cudaDeviceProp structure.
328
+ //
329
+ // Example:
330
+ /*
331
+ {
332
+ cudaDeviceProp prop;
333
+
334
+ cudaGetDeviceProperties(&prop, ...);
335
+
336
+ cudaOccDeviceProp occProp = prop;
337
+
338
+ ...
339
+
340
+ cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
341
+ }
342
+ */
343
+ // DeviceProp may be any type that exposes the members read in the initializer list below.
344
+ template<typename DeviceProp>
345
+ __OCC_INLINE
346
+ cudaOccDeviceProp(const DeviceProp &props)
347
+ : computeMajor (props.major),
348
+ computeMinor (props.minor),
349
+ maxThreadsPerBlock (props.maxThreadsPerBlock),
350
+ maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
351
+ regsPerBlock (props.regsPerBlock),
352
+ regsPerMultiprocessor (props.regsPerMultiprocessor),
353
+ warpSize (props.warpSize),
354
+ sharedMemPerBlock (props.sharedMemPerBlock),
355
+ sharedMemPerMultiprocessor (props.sharedMemPerMultiprocessor),
356
+ numSms (props.multiProcessorCount),
357
+ sharedMemPerBlockOptin (props.sharedMemPerBlockOptin),
358
+ reservedSharedMemPerBlock (props.reservedSharedMemPerBlock)
359
+ {}
360
+ // Default constructor: sets every field to 0.
361
+ __OCC_INLINE
362
+ cudaOccDeviceProp()
363
+ : computeMajor (0),
364
+ computeMinor (0),
365
+ maxThreadsPerBlock (0),
366
+ maxThreadsPerMultiprocessor (0),
367
+ regsPerBlock (0),
368
+ regsPerMultiprocessor (0),
369
+ warpSize (0),
370
+ sharedMemPerBlock (0),
371
+ sharedMemPerMultiprocessor (0),
372
+ numSms (0),
373
+ sharedMemPerBlockOptin (0),
374
+ reservedSharedMemPerBlock (0)
375
+ {}
376
+ #endif // __cplusplus
377
+ };
378
+
379
+ /**
380
+ * Partitioned global caching option
381
+ */
382
+ typedef enum cudaOccPartitionedGCConfig_enum {
383
+ PARTITIONED_GC_OFF, // Disable partitioned global caching
384
+ PARTITIONED_GC_ON, // Prefer partitioned global caching
385
+ PARTITIONED_GC_ON_STRICT // Force partitioned global caching
386
+ } cudaOccPartitionedGCConfig;
387
+
388
+ /**
389
+ * Per function opt in maximum dynamic shared memory limit
390
+ */
391
+ typedef enum cudaOccFuncShmemConfig_enum {
392
+ FUNC_SHMEM_LIMIT_DEFAULT, // Default shmem limit
393
+ FUNC_SHMEM_LIMIT_OPTIN, // Use the optin shmem limit
394
+ } cudaOccFuncShmemConfig;
395
+
396
+ /**
397
+ * Function descriptor
398
+ *
399
+ * This structure describes a CUDA function.
400
+ */
401
+ struct cudaOccFuncAttributes {
402
+ int maxThreadsPerBlock; // Maximum block size the function can work with. If
403
+ // unlimited, use INT_MAX or any value greater than
404
+ // or equal to maxThreadsPerBlock of the device
405
+ int numRegs; // Number of registers used. When the function is
406
+ // launched on device, the register count may change
407
+ // due to internal tools requirements.
408
+ size_t sharedSizeBytes; // Size of static shared memory used, in bytes
409
+
410
+ cudaOccPartitionedGCConfig partitionedGCConfig;
411
+ // Partitioned global caching is required to enable
412
+ // caching on certain chips, such as sm_52
413
+ // devices. Partitioned global caching can be
414
+ // automatically disabled if the occupancy
415
+ // requirement of the launch cannot support caching.
416
+ //
417
+ // To override this behavior with caching on and
418
+ // calculate occupancy strictly according to the
419
+ // preference, set partitionedGCConfig to
420
+ // PARTITIONED_GC_ON_STRICT. This is especially
421
+ // useful for experimenting and finding launch
422
+ // configurations (MaxPotentialOccupancyBlockSize)
423
+ // that allow global caching to take effect.
424
+ //
425
+ // This flag only affects the occupancy calculation.
426
+
427
+ cudaOccFuncShmemConfig shmemLimitConfig;
428
+ // Certain chips like sm_70 allow a user to opt into
429
+ // a higher per block limit of dynamic shared memory
430
+ // This optin is performed on a per function basis
431
+ // using the cuFuncSetAttribute function
432
+
433
+ size_t maxDynamicSharedSizeBytes;
434
+ // User set limit on maximum dynamic shared memory
435
+ // usable by the kernel
436
+ // This limit is set using the cuFuncSetAttribute
437
+ // function.
438
+
439
+ int numBlockBarriers; // Number of block barriers used (the converting constructor below sets 1; the default constructor sets 0)
440
+ #ifdef __cplusplus
441
+ // This structure can be converted from a cudaFuncAttributes structure for
442
+ // users that use this header in their CUDA applications.
443
+ //
444
+ // If the application has access to the CUDA Runtime API, the application
445
+ // can obtain the function attributes of a CUDA kernel function through
446
+ // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
447
+ // cudaFuncAttributes structure.
448
+ //
449
+ // Example:
450
+ /*
451
+ __global__ void foo() {...}
452
+
453
+ ...
454
+
455
+ {
456
+ cudaFuncAttributes attr;
457
+
458
+ cudaFuncGetAttributes(&attr, foo);
459
+
460
+ cudaOccFuncAttributes occAttr = attr;
461
+
462
+ ...
463
+
464
+ cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
465
+ }
466
+ */
467
+ // FuncAttributes may be any type exposing the members read below; partitionedGCConfig, shmemLimitConfig and numBlockBarriers are set to fixed values, not read from attr.
468
+ template<typename FuncAttributes>
469
+ __OCC_INLINE
470
+ cudaOccFuncAttributes(const FuncAttributes &attr)
471
+ : maxThreadsPerBlock (attr.maxThreadsPerBlock),
472
+ numRegs (attr.numRegs),
473
+ sharedSizeBytes (attr.sharedSizeBytes),
474
+ partitionedGCConfig (PARTITIONED_GC_OFF),
475
+ shmemLimitConfig (FUNC_SHMEM_LIMIT_OPTIN),
476
+ maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
477
+ numBlockBarriers (1)
478
+ {}
479
+ // Default constructor: numeric fields are 0; unlike the converting constructor, shmemLimitConfig is FUNC_SHMEM_LIMIT_DEFAULT and numBlockBarriers is 0.
480
+ __OCC_INLINE
481
+ cudaOccFuncAttributes()
482
+ : maxThreadsPerBlock (0),
483
+ numRegs (0),
484
+ sharedSizeBytes (0),
485
+ partitionedGCConfig (PARTITIONED_GC_OFF),
486
+ shmemLimitConfig (FUNC_SHMEM_LIMIT_DEFAULT),
487
+ maxDynamicSharedSizeBytes (0),
488
+ numBlockBarriers (0)
489
+ {}
490
+ #endif
491
+ };
492
+
493
+ typedef enum cudaOccCacheConfig_enum {
494
+ CACHE_PREFER_NONE = 0x00, // no preference for shared memory or L1 (default)
495
+ CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache
496
+ CACHE_PREFER_L1 = 0x02, // prefer larger L1 cache and smaller shared memory
497
+ CACHE_PREFER_EQUAL = 0x03 // prefer equal sized L1 cache and shared memory
498
+ } cudaOccCacheConfig;
499
+
500
// Named shared memory carveout settings. Apart from the default marker, each
// value is the percentage of the maximum shared memory that is requested.
typedef enum cudaOccCarveoutConfig_enum {
    SHAREDMEM_CARVEOUT_DEFAULT    = -1,   // default: no preference between shared memory and L1
    SHAREDMEM_CARVEOUT_MAX_SHARED = 100,  // as much shared memory as possible, minimum L1 cache
    SHAREDMEM_CARVEOUT_MAX_L1     = 0,    // as much L1 cache as possible, minimum shared memory
    SHAREDMEM_CARVEOUT_HALF       = 50    // half of the maximum shared memory, the rest as L1 cache
} cudaOccCarveoutConfig;
506
+
507
+ /**
508
+ * Device state descriptor
509
+ *
510
+ * This structure describes device settings that affect occupancy calculation.
511
+ */
512
+ struct cudaOccDeviceState
513
+ {
514
+ // Cache / shared memory split preference. Deprecated on Volta
515
+ cudaOccCacheConfig cacheConfig;
516
+ // Shared memory / L1 split preference. Supported on only Volta
517
+ int carveoutConfig;
518
+
519
+ #ifdef __cplusplus
520
+ __OCC_INLINE
521
+ cudaOccDeviceState()
522
+ : cacheConfig (CACHE_PREFER_NONE),
523
+ carveoutConfig (SHAREDMEM_CARVEOUT_DEFAULT)
524
+ {}
525
+ #endif
526
+ };
527
+
528
// Reasons why occupancy was limited. Each enumerator occupies its own bit so
// that several reasons can be reported together.
typedef enum cudaOccLimitingFactor_enum {
    OCC_LIMIT_WARPS         = 1 << 0,  // warps available
    OCC_LIMIT_REGISTERS     = 1 << 1,  // registers available
    OCC_LIMIT_SHARED_MEMORY = 1 << 2,  // shared memory available
    OCC_LIMIT_BLOCKS        = 1 << 3,  // blocks available
    OCC_LIMIT_BARRIERS      = 1 << 4   // barriers available
} cudaOccLimitingFactor;
536
+
537
+ /**
538
+ * Occupancy output
539
+ *
540
+ * This structure contains occupancy calculator's output.
541
+ */
542
+ struct cudaOccResult {
543
+ int activeBlocksPerMultiprocessor; // Occupancy
544
+ unsigned int limitingFactors; // Factors that limited occupancy. A bit
545
+ // field that counts the limiting
546
+ // factors, see cudaOccLimitingFactor
547
+ int blockLimitRegs; // Occupancy due to register
548
+ // usage, INT_MAX if the kernel does not
549
+ // use any register.
550
+ int blockLimitSharedMem; // Occupancy due to shared memory
551
+ // usage, INT_MAX if the kernel does not
552
+ // use shared memory.
553
+ int blockLimitWarps; // Occupancy due to block size limit
554
+ int blockLimitBlocks; // Occupancy due to maximum number of blocks
555
+ // managable per SM
556
+ int blockLimitBarriers; // Occupancy due to block barrier usage
557
+ int allocatedRegistersPerBlock; // Actual number of registers allocated per
558
+ // block
559
+ size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
560
+ // per block
561
+ cudaOccPartitionedGCConfig partitionedGCConfig;
562
+ // Report if partitioned global caching
563
+ // is actually enabled.
564
+ };
565
+
566
/**
 * Partitioned global caching support
 *
 * Reported by cudaOccPartitionedGlobalCachingModeSupport.
 *
 * The enumerator list deliberately has no trailing comma: C89 and C++98
 * reject one in pedantic mode, and no other enum in this header uses it.
 */
typedef enum cudaOccPartitionedGCSupport_enum {
    PARTITIONED_GC_NOT_SUPPORTED,  // Partitioned global caching is not supported
    PARTITIONED_GC_SUPPORTED       // Partitioned global caching is supported
} cudaOccPartitionedGCSupport;
575
+
576
+ /**
577
+ * Implementation
578
+ */
579
+
580
+ /**
581
+ * Max compute capability supported
582
+ */
583
+
584
+ #define __CUDA_OCC_MAJOR__ 12
585
+ #define __CUDA_OCC_MINOR__ 0
586
+
587
+ //////////////////////////////////////////
588
+ // Mathematical Helper Functions //
589
+ //////////////////////////////////////////
590
+
591
+ static __OCC_INLINE int __occMin(int lhs, int rhs)
592
+ {
593
+ return rhs < lhs ? rhs : lhs;
594
+ }
595
+
596
+ static __OCC_INLINE int __occDivideRoundUp(int x, int y)
597
+ {
598
+ return (x + (y - 1)) / y;
599
+ }
600
+
601
+ static __OCC_INLINE int __occRoundUp(int x, int y)
602
+ {
603
+ return y * __occDivideRoundUp(x, y);
604
+ }
605
+
606
+ //////////////////////////////////////////
607
+ // Architectural Properties //
608
+ //////////////////////////////////////////
609
+
610
+ /**
611
+ * Granularity of shared memory allocation
612
+ */
613
+ static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
614
+ {
615
+ int value;
616
+
617
+ switch(properties->computeMajor) {
618
+ case 3:
619
+ case 5:
620
+ case 6:
621
+ case 7:
622
+ value = 256;
623
+ break;
624
+ case 8:
625
+ case 9:
626
+ case 10:
627
+ case 12:
628
+ value = 128;
629
+ break;
630
+ default:
631
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
632
+ }
633
+
634
+ *limit = value;
635
+
636
+ return CUDA_OCC_SUCCESS;
637
+ }
638
+
639
+ /**
640
+ * Maximum number of registers per thread
641
+ */
642
+ static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
643
+ {
644
+ int value;
645
+
646
+ switch(properties->computeMajor) {
647
+ case 3:
648
+ case 5:
649
+ case 6:
650
+ value = 255;
651
+ break;
652
+ case 7:
653
+ case 8:
654
+ case 9:
655
+ case 10:
656
+ case 12:
657
+ value = 256;
658
+ break;
659
+ default:
660
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
661
+ }
662
+
663
+ *limit = value;
664
+
665
+ return CUDA_OCC_SUCCESS;
666
+ }
667
+
668
+ /**
669
+ * Granularity of register allocation
670
+ */
671
+ static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
672
+ {
673
+ int value;
674
+
675
+ switch(properties->computeMajor) {
676
+ case 3:
677
+ case 5:
678
+ case 6:
679
+ case 7:
680
+ case 8:
681
+ case 9:
682
+ case 10:
683
+ case 12:
684
+ value = 256;
685
+ break;
686
+ default:
687
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
688
+ }
689
+
690
+ *limit = value;
691
+
692
+ return CUDA_OCC_SUCCESS;
693
+ }
694
+
695
+ /**
696
+ * Number of sub-partitions
697
+ */
698
+ static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
699
+ {
700
+ int value;
701
+
702
+ switch(properties->computeMajor) {
703
+ case 3:
704
+ case 5:
705
+ case 7:
706
+ case 8:
707
+ case 9:
708
+ case 10:
709
+ case 12:
710
+ value = 4;
711
+ break;
712
+ case 6:
713
+ value = properties->computeMinor ? 4 : 2;
714
+ break;
715
+ default:
716
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
717
+ }
718
+
719
+ *limit = value;
720
+
721
+ return CUDA_OCC_SUCCESS;
722
+ }
723
+
724
+
725
+ /**
726
+ * Maximum number of blocks that can run simultaneously on a multiprocessor
727
+ */
728
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
729
+ {
730
+ int value;
731
+
732
+ switch(properties->computeMajor) {
733
+ case 3:
734
+ value = 16;
735
+ break;
736
+ case 5:
737
+ case 6:
738
+ value = 32;
739
+ break;
740
+ case 7: {
741
+ int isTuring = properties->computeMinor == 5;
742
+ value = (isTuring) ? 16 : 32;
743
+ break;
744
+ }
745
+ case 8:
746
+ if (properties->computeMinor == 0) {
747
+ value = 32;
748
+ }
749
+ else if (properties->computeMinor == 9) {
750
+ value = 24;
751
+ }
752
+ else {
753
+ value = 16;
754
+ }
755
+ break;
756
+ case 9:
757
+ value = 32;
758
+ break;
759
+ case 10:
760
+ switch(properties->computeMinor) {
761
+ case 1 :
762
+ value = 24;
763
+ break;
764
+ case 0 : /* explicitly added to avoid build failure in WDDM driver components */
765
+ default :
766
+ value = 32;
767
+ }
768
+ break;
769
+ case 12:
770
+ value = 24;
771
+ break;
772
+ default:
773
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
774
+ }
775
+
776
+ *limit = value;
777
+
778
+ return CUDA_OCC_SUCCESS;
779
+ }
780
+
781
+ /**
782
+ * Align up shared memory based on compute major configurations
783
+ */
784
+ static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
785
+ {
786
+ // Volta and Turing have shared L1 cache / shared memory, and support cache
787
+ // configuration to trade one for the other. These values are needed to
788
+ // map carveout config ratio to the next available architecture size
789
+ size_t size = *shMemSize;
790
+
791
+ switch (properties->computeMajor) {
792
+ case 7: {
793
+ // Turing supports 32KB and 64KB shared mem.
794
+ int isTuring = properties->computeMinor == 5;
795
+ if (isTuring) {
796
+ if (size <= 32 * 1024) {
797
+ *shMemSize = 32 * 1024;
798
+ }
799
+ else if (size <= 64 * 1024) {
800
+ *shMemSize = 64 * 1024;
801
+ }
802
+ else {
803
+ return CUDA_OCC_ERROR_INVALID_INPUT;
804
+ }
805
+ }
806
+ // Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem.
807
+ else {
808
+ if (size == 0) {
809
+ *shMemSize = 0;
810
+ }
811
+ else if (size <= 8 * 1024) {
812
+ *shMemSize = 8 * 1024;
813
+ }
814
+ else if (size <= 16 * 1024) {
815
+ *shMemSize = 16 * 1024;
816
+ }
817
+ else if (size <= 32 * 1024) {
818
+ *shMemSize = 32 * 1024;
819
+ }
820
+ else if (size <= 64 * 1024) {
821
+ *shMemSize = 64 * 1024;
822
+ }
823
+ else if (size <= 96 * 1024) {
824
+ *shMemSize = 96 * 1024;
825
+ }
826
+ else {
827
+ return CUDA_OCC_ERROR_INVALID_INPUT;
828
+ }
829
+ }
830
+ break;
831
+ }
832
+ case 8:
833
+ if (properties->computeMinor == 0 || properties->computeMinor == 7) {
834
+ if (size == 0) {
835
+ *shMemSize = 0;
836
+ }
837
+ else if (size <= 8 * 1024) {
838
+ *shMemSize = 8 * 1024;
839
+ }
840
+ else if (size <= 16 * 1024) {
841
+ *shMemSize = 16 * 1024;
842
+ }
843
+ else if (size <= 32 * 1024) {
844
+ *shMemSize = 32 * 1024;
845
+ }
846
+ else if (size <= 64 * 1024) {
847
+ *shMemSize = 64 * 1024;
848
+ }
849
+ else if (size <= 100 * 1024) {
850
+ *shMemSize = 100 * 1024;
851
+ }
852
+ else if (size <= 132 * 1024) {
853
+ *shMemSize = 132 * 1024;
854
+ }
855
+ else if (size <= 164 * 1024) {
856
+ *shMemSize = 164 * 1024;
857
+ }
858
+ else {
859
+ return CUDA_OCC_ERROR_INVALID_INPUT;
860
+ }
861
+ }
862
+ else {
863
+ if (size == 0) {
864
+ *shMemSize = 0;
865
+ }
866
+ else if (size <= 8 * 1024) {
867
+ *shMemSize = 8 * 1024;
868
+ }
869
+ else if (size <= 16 * 1024) {
870
+ *shMemSize = 16 * 1024;
871
+ }
872
+ else if (size <= 32 * 1024) {
873
+ *shMemSize = 32 * 1024;
874
+ }
875
+ else if (size <= 64 * 1024) {
876
+ *shMemSize = 64 * 1024;
877
+ }
878
+ else if (size <= 100 * 1024) {
879
+ *shMemSize = 100 * 1024;
880
+ }
881
+ else {
882
+ return CUDA_OCC_ERROR_INVALID_INPUT;
883
+ }
884
+ }
885
+ break;
886
+ case 9: {
887
+ if (size == 0) {
888
+ *shMemSize = 0;
889
+ }
890
+ else if (size <= 8 * 1024) {
891
+ *shMemSize = 8 * 1024;
892
+ }
893
+ else if (size <= 16 * 1024) {
894
+ *shMemSize = 16 * 1024;
895
+ }
896
+ else if (size <= 32 * 1024) {
897
+ *shMemSize = 32 * 1024;
898
+ }
899
+ else if (size <= 64 * 1024) {
900
+ *shMemSize = 64 * 1024;
901
+ }
902
+ else if (size <= 100 * 1024) {
903
+ *shMemSize = 100 * 1024;
904
+ }
905
+ else if (size <= 132 * 1024) {
906
+ *shMemSize = 132 * 1024;
907
+ }
908
+ else if (size <= 164 * 1024) {
909
+ *shMemSize = 164 * 1024;
910
+ }
911
+ else if (size <= 196 * 1024) {
912
+ *shMemSize = 196 * 1024;
913
+ }
914
+ else if (size <= 228 * 1024) {
915
+ *shMemSize = 228 * 1024;
916
+ }
917
+ else {
918
+ return CUDA_OCC_ERROR_INVALID_INPUT;
919
+ }
920
+ break;
921
+ }
922
+ case 10: {
923
+ switch (properties->computeMinor) {
924
+ // GB10x GPUs in Blackwell family have the below compute minors and corresponding
925
+ // shared memory configs
926
+ case 0:
927
+ case 1:
928
+ if (size == 0) {
929
+ *shMemSize = 0;
930
+ }
931
+ else if (size <= 8 * 1024) {
932
+ *shMemSize = 8 * 1024;
933
+ }
934
+ else if (size <= 16 * 1024) {
935
+ *shMemSize = 16 * 1024;
936
+ }
937
+ else if (size <= 32 * 1024) {
938
+ *shMemSize = 32 * 1024;
939
+ }
940
+ else if (size <= 64 * 1024) {
941
+ *shMemSize = 64 * 1024;
942
+ }
943
+ else if (size <= 100 * 1024) {
944
+ *shMemSize = 100 * 1024;
945
+ }
946
+ else if (size <= 132 * 1024) {
947
+ *shMemSize = 132 * 1024;
948
+ }
949
+ else if (size <= 164 * 1024) {
950
+ *shMemSize = 164 * 1024;
951
+ }
952
+ else if (size <= 196 * 1024) {
953
+ *shMemSize = 196 * 1024;
954
+ }
955
+ else if (size <= 228 * 1024) {
956
+ *shMemSize = 228 * 1024;
957
+ }
958
+ else {
959
+ return CUDA_OCC_ERROR_INVALID_INPUT;
960
+ }
961
+ break;
962
+ default:
963
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
964
+ }
965
+ break;
966
+ }
967
+ case 12: {
968
+ switch (properties->computeMinor) {
969
+ case 0:
970
+ if (size == 0) {
971
+ *shMemSize = 0;
972
+ }
973
+ else if (size <= 8 * 1024) {
974
+ *shMemSize = 8 * 1024;
975
+ }
976
+ else if (size <= 16 * 1024) {
977
+ *shMemSize = 16 * 1024;
978
+ }
979
+ else if (size <= 32 * 1024) {
980
+ *shMemSize = 32 * 1024;
981
+ }
982
+ else if (size <= 64 * 1024) {
983
+ *shMemSize = 64 * 1024;
984
+ }
985
+ else if (size <= 100 * 1024) {
986
+ *shMemSize = 100 * 1024;
987
+ }
988
+ else {
989
+ return CUDA_OCC_ERROR_INVALID_INPUT;
990
+ }
991
+ break;
992
+ default:
993
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
994
+ }
995
+ break;
996
+ }
997
+ break;
998
+ default:
999
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
1000
+ }
1001
+
1002
+ return CUDA_OCC_SUCCESS;
1003
+ }
1004
+
1005
+ /**
1006
+ * Shared memory based on the new carveoutConfig API introduced with Volta
1007
+ */
1008
+ static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
1009
+ {
1010
+ cudaOccError status = CUDA_OCC_SUCCESS;
1011
+ size_t preferenceShmemSize;
1012
+
1013
+ // CUDA 9.0 introduces a new API to set shared memory - L1 configuration on supported
1014
+ // devices. This preference will take precedence over the older cacheConfig setting.
1015
+ // Map cacheConfig to its effective preference value.
1016
+ int effectivePreference = state->carveoutConfig;
1017
+ if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) {
1018
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1019
+ }
1020
+
1021
+ if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
1022
+ switch (state->cacheConfig)
1023
+ {
1024
+ case CACHE_PREFER_L1:
1025
+ effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1;
1026
+ break;
1027
+ case CACHE_PREFER_SHARED:
1028
+ effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED;
1029
+ break;
1030
+ case CACHE_PREFER_EQUAL:
1031
+ effectivePreference = SHAREDMEM_CARVEOUT_HALF;
1032
+ break;
1033
+ default:
1034
+ effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT;
1035
+ break;
1036
+ }
1037
+ }
1038
+
1039
+ if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
1040
+ preferenceShmemSize = properties->sharedMemPerMultiprocessor;
1041
+ }
1042
+ else {
1043
+ preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100;
1044
+ }
1045
+
1046
+ status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties);
1047
+ *limit = preferenceShmemSize;
1048
+ return status;
1049
+ }
1050
+
1051
+ /**
1052
+ * Shared memory based on the cacheConfig
1053
+ */
1054
+ static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
1055
+ {
1056
+ size_t bytes = 0;
1057
+ size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor;
1058
+ cudaOccCacheConfig cacheConfig = state->cacheConfig;
1059
+
1060
+ // Kepler has shared L1 cache / shared memory, and support cache
1061
+ // configuration to trade one for the other. These values are needed to
1062
+ // calculate the correct shared memory size for user requested cache
1063
+ // configuration.
1064
+ //
1065
+ size_t minCacheSize = 16384;
1066
+ size_t maxCacheSize = 49152;
1067
+ size_t cacheAndSharedTotal = sharedMemPerMultiprocessorHigh + minCacheSize;
1068
+ size_t sharedMemPerMultiprocessorLow = cacheAndSharedTotal - maxCacheSize;
1069
+
1070
+ switch (properties->computeMajor) {
1071
+ case 3:
1072
+ // Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest
1073
+ // is shared memory.
1074
+ //
1075
+ switch (cacheConfig) {
1076
+ default :
1077
+ case CACHE_PREFER_NONE:
1078
+ case CACHE_PREFER_SHARED:
1079
+ bytes = sharedMemPerMultiprocessorHigh;
1080
+ break;
1081
+ case CACHE_PREFER_L1:
1082
+ bytes = sharedMemPerMultiprocessorLow;
1083
+ break;
1084
+ case CACHE_PREFER_EQUAL:
1085
+ // Equal is the mid-point between high and low. It should be
1086
+ // equivalent to low + 16KB.
1087
+ //
1088
+ bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2;
1089
+ break;
1090
+ }
1091
+ break;
1092
+ case 5:
1093
+ case 6:
1094
+ // Maxwell and Pascal have dedicated shared memory.
1095
+ //
1096
+ bytes = sharedMemPerMultiprocessorHigh;
1097
+ break;
1098
+ default:
1099
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
1100
+ }
1101
+
1102
+ *limit = bytes;
1103
+
1104
+ return CUDA_OCC_SUCCESS;
1105
+ }
1106
+
1107
+ /**
1108
+ * Shared memory based on config requested by User
1109
+ */
1110
+ static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
1111
+ {
1112
+ // Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference,
1113
+ // it is handled separately from the cache config preference.
1114
+ if (properties->computeMajor >= 7) {
1115
+ return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
1116
+ }
1117
+ return cudaOccSMemPreference(limit, properties, state);
1118
+ }
1119
+
1120
+ /**
1121
+ * Return the per block shared memory limit based on function config
1122
+ */
1123
+ static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
1124
+ {
1125
+ switch (properties->computeMajor) {
1126
+ case 2:
1127
+ case 3:
1128
+ case 4:
1129
+ case 5:
1130
+ case 6:
1131
+ *limit = properties->sharedMemPerBlock;
1132
+ break;
1133
+ case 7:
1134
+ case 8:
1135
+ case 9:
1136
+ case 10:
1137
+ case 12:
1138
+ switch (shmemLimitConfig) {
1139
+ default:
1140
+ case FUNC_SHMEM_LIMIT_DEFAULT:
1141
+ *limit = properties->sharedMemPerBlock;
1142
+ break;
1143
+ case FUNC_SHMEM_LIMIT_OPTIN:
1144
+ if (smemPerCta > properties->sharedMemPerBlock) {
1145
+ *limit = properties->sharedMemPerBlockOptin;
1146
+ }
1147
+ else {
1148
+ *limit = properties->sharedMemPerBlock;
1149
+ }
1150
+ break;
1151
+ }
1152
+ break;
1153
+ default:
1154
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
1155
+ }
1156
+
1157
+ // Starting Ampere, CUDA driver reserves additional shared memory per block
1158
+ if (properties->computeMajor >= 8) {
1159
+ *limit += properties->reservedSharedMemPerBlock;
1160
+ }
1161
+
1162
+ return CUDA_OCC_SUCCESS;
1163
+ }
1164
+
1165
+ /**
1166
+ * Partitioned global caching mode support
1167
+ */
1168
+ static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
1169
+ {
1170
+ *limit = PARTITIONED_GC_NOT_SUPPORTED;
1171
+
1172
+ if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) ||
1173
+ properties->computeMajor == 6) {
1174
+ *limit = PARTITIONED_GC_SUPPORTED;
1175
+ }
1176
+
1177
+ if (properties->computeMajor == 6 && properties->computeMinor == 0) {
1178
+ *limit = PARTITIONED_GC_NOT_SUPPORTED;
1179
+ }
1180
+
1181
+ return CUDA_OCC_SUCCESS;
1182
+ }
1183
+
1184
+ ///////////////////////////////////////////////
1185
+ // User Input Sanity //
1186
+ ///////////////////////////////////////////////
1187
+
1188
+ static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
1189
+ {
1190
+ // Verify device properties
1191
+ //
1192
+ // Each of these limits must be a positive number.
1193
+ //
1194
+ // Compute capacity is checked during the occupancy calculation
1195
+ //
1196
+ if (properties->maxThreadsPerBlock <= 0 ||
1197
+ properties->maxThreadsPerMultiprocessor <= 0 ||
1198
+ properties->regsPerBlock <= 0 ||
1199
+ properties->regsPerMultiprocessor <= 0 ||
1200
+ properties->warpSize <= 0 ||
1201
+ properties->sharedMemPerBlock <= 0 ||
1202
+ properties->sharedMemPerMultiprocessor <= 0 ||
1203
+ properties->numSms <= 0) {
1204
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1205
+ }
1206
+
1207
+ return CUDA_OCC_SUCCESS;
1208
+ }
1209
+
1210
+ static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
1211
+ {
1212
+ // Verify function attributes
1213
+ //
1214
+ if (attributes->maxThreadsPerBlock <= 0 ||
1215
+ attributes->numRegs < 0) { // Compiler may choose not to use
1216
+ // any register (empty kernels,
1217
+ // etc.)
1218
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1219
+ }
1220
+
1221
+ return CUDA_OCC_SUCCESS;
1222
+ }
1223
+
1224
+ static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
1225
+ {
1226
+ (void)state; // silence unused-variable warning
1227
+ // Placeholder
1228
+ //
1229
+
1230
+ return CUDA_OCC_SUCCESS;
1231
+ }
1232
+
1233
+ static __OCC_INLINE cudaOccError cudaOccInputCheck(
1234
+ const cudaOccDeviceProp *properties,
1235
+ const cudaOccFuncAttributes *attributes,
1236
+ const cudaOccDeviceState *state)
1237
+ {
1238
+ cudaOccError status = CUDA_OCC_SUCCESS;
1239
+
1240
+ status = cudaOccDevicePropCheck(properties);
1241
+ if (status != CUDA_OCC_SUCCESS) {
1242
+ return status;
1243
+ }
1244
+
1245
+ status = cudaOccFuncAttributesCheck(attributes);
1246
+ if (status != CUDA_OCC_SUCCESS) {
1247
+ return status;
1248
+ }
1249
+
1250
+ status = cudaOccDeviceStateCheck(state);
1251
+ if (status != CUDA_OCC_SUCCESS) {
1252
+ return status;
1253
+ }
1254
+
1255
+ return status;
1256
+ }
1257
+
1258
+ ///////////////////////////////////////////////
1259
+ // Occupancy calculation Functions //
1260
+ ///////////////////////////////////////////////
1261
+
1262
+ static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
1263
+ const cudaOccDeviceProp *properties,
1264
+ const cudaOccFuncAttributes *attributes)
1265
+ {
1266
+ cudaOccPartitionedGCSupport gcSupport;
1267
+ cudaOccPartitionedGCConfig gcConfig;
1268
+
1269
+ cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties);
1270
+
1271
+ gcConfig = attributes->partitionedGCConfig;
1272
+
1273
+ if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) {
1274
+ gcConfig = PARTITIONED_GC_OFF;
1275
+ }
1276
+
1277
+ return gcConfig;
1278
+ }
1279
+
1280
+ // Warp limit
1281
+ //
1282
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
1283
+ int *limit,
1284
+ cudaOccPartitionedGCConfig gcConfig,
1285
+ const cudaOccDeviceProp *properties,
1286
+ const cudaOccFuncAttributes *attributes,
1287
+ int blockSize)
1288
+ {
1289
+ cudaOccError status = CUDA_OCC_SUCCESS;
1290
+ int maxWarpsPerSm;
1291
+ int warpsAllocatedPerCTA;
1292
+ int maxBlocks;
1293
+ (void)attributes; // silence unused-variable warning
1294
+
1295
+ if (blockSize > properties->maxThreadsPerBlock) {
1296
+ maxBlocks = 0;
1297
+ }
1298
+ else {
1299
+ maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
1300
+ warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
1301
+ maxBlocks = 0;
1302
+
1303
+ if (gcConfig != PARTITIONED_GC_OFF) {
1304
+ int maxBlocksPerSmPartition;
1305
+ int maxWarpsPerSmPartition;
1306
+
1307
+ // If partitioned global caching is on, then a CTA can only use a SM
1308
+ // partition (a half SM), and thus a half of the warp slots
1309
+ // available per SM
1310
+ //
1311
+ maxWarpsPerSmPartition = maxWarpsPerSm / 2;
1312
+ maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA;
1313
+ maxBlocks = maxBlocksPerSmPartition * 2;
1314
+ }
1315
+ // On hardware that supports partitioned global caching, each half SM is
1316
+ // guaranteed to support at least 32 warps (maximum number of warps of a
1317
+ // CTA), so caching will not cause 0 occupancy due to insufficient warp
1318
+ // allocation slots.
1319
+ //
1320
+ else {
1321
+ maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA;
1322
+ }
1323
+ }
1324
+
1325
+ *limit = maxBlocks;
1326
+
1327
+ return status;
1328
+ }
1329
+
1330
+ // Shared memory limit
1331
+ //
1332
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
1333
+ int *limit,
1334
+ cudaOccResult *result,
1335
+ const cudaOccDeviceProp *properties,
1336
+ const cudaOccFuncAttributes *attributes,
1337
+ const cudaOccDeviceState *state,
1338
+ int blockSize,
1339
+ size_t dynamicSmemSize)
1340
+ {
1341
+ cudaOccError status = CUDA_OCC_SUCCESS;
1342
+ int allocationGranularity;
1343
+ size_t userSmemPreference = 0;
1344
+ size_t totalSmemUsagePerCTA;
1345
+ size_t maxSmemUsagePerCTA;
1346
+ size_t smemAllocatedPerCTA;
1347
+ size_t staticSmemSize;
1348
+ size_t sharedMemPerMultiprocessor;
1349
+ size_t smemLimitPerCTA;
1350
+ int maxBlocks;
1351
+ int dynamicSmemSizeExceeded = 0;
1352
+ int totalSmemSizeExceeded = 0;
1353
+ (void)blockSize; // silence unused-variable warning
1354
+
1355
+ status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
1356
+ if (status != CUDA_OCC_SUCCESS) {
1357
+ return status;
1358
+ }
1359
+
1360
+ // Obtain the user preferred shared memory size. This setting is ignored if
1361
+ // user requests more shared memory than preferred.
1362
+ //
1363
+ status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
1364
+ if (status != CUDA_OCC_SUCCESS) {
1365
+ return status;
1366
+ }
1367
+
1368
+ staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
1369
+ totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
1370
+ smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);
1371
+
1372
+ maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;
1373
+
1374
+ dynamicSmemSizeExceeded = 0;
1375
+ totalSmemSizeExceeded = 0;
1376
+
1377
+ // Obtain the user set maximum dynamic size if it exists
1378
+ // If so, the current launch dynamic shared memory must not
1379
+ // exceed the set limit
1380
+ if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
1381
+ dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
1382
+ dynamicSmemSizeExceeded = 1;
1383
+ }
1384
+
1385
+ status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
1386
+ if (status != CUDA_OCC_SUCCESS) {
1387
+ return status;
1388
+ }
1389
+
1390
+ if (smemAllocatedPerCTA > smemLimitPerCTA) {
1391
+ totalSmemSizeExceeded = 1;
1392
+ }
1393
+
1394
+ if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
1395
+ maxBlocks = 0;
1396
+ }
1397
+ else {
1398
+ // User requested shared memory limit is used as long as it is greater
1399
+ // than the total shared memory used per CTA, i.e. as long as at least
1400
+ // one CTA can be launched.
1401
+ if (userSmemPreference >= smemAllocatedPerCTA) {
1402
+ sharedMemPerMultiprocessor = userSmemPreference;
1403
+ }
1404
+ else {
1405
+ // On Volta+, user requested shared memory will limit occupancy
1406
+ // if it's less than shared memory per CTA. Otherwise, the
1407
+ // maximum shared memory limit is used.
1408
+ if (properties->computeMajor >= 7) {
1409
+ sharedMemPerMultiprocessor = smemAllocatedPerCTA;
1410
+ status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
1411
+ if (status != CUDA_OCC_SUCCESS) {
1412
+ return status;
1413
+ }
1414
+ }
1415
+ else {
1416
+ sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
1417
+ }
1418
+ }
1419
+
1420
+ if (smemAllocatedPerCTA > 0) {
1421
+ maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
1422
+ }
1423
+ else {
1424
+ maxBlocks = INT_MAX;
1425
+ }
1426
+ }
1427
+
1428
+ result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;
1429
+
1430
+ *limit = maxBlocks;
1431
+
1432
+ return status;
1433
+ }
1434
+
1435
+ static __OCC_INLINE
1436
+ cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
1437
+ int *limit,
1438
+ cudaOccPartitionedGCConfig *gcConfig,
1439
+ cudaOccResult *result,
1440
+ const cudaOccDeviceProp *properties,
1441
+ const cudaOccFuncAttributes *attributes,
1442
+ int blockSize)
1443
+ {
1444
+ cudaOccError status = CUDA_OCC_SUCCESS;
1445
+ int allocationGranularity;
1446
+ int warpsAllocatedPerCTA;
1447
+ int regsAllocatedPerCTA;
1448
+ int regsAssumedPerCTA;
1449
+ int regsPerWarp;
1450
+ int regsAllocatedPerWarp;
1451
+ int numSubPartitions;
1452
+ int numRegsPerSubPartition;
1453
+ int numWarpsPerSubPartition;
1454
+ int numWarpsPerSM;
1455
+ int maxBlocks;
1456
+ int maxRegsPerThread;
1457
+
1458
+ status = cudaOccRegAllocationGranularity(
1459
+ &allocationGranularity,
1460
+ properties);
1461
+ if (status != CUDA_OCC_SUCCESS) {
1462
+ return status;
1463
+ }
1464
+
1465
+ status = cudaOccRegAllocationMaxPerThread(
1466
+ &maxRegsPerThread,
1467
+ properties);
1468
+ if (status != CUDA_OCC_SUCCESS) {
1469
+ return status;
1470
+ }
1471
+
1472
+ status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
1473
+ if (status != CUDA_OCC_SUCCESS) {
1474
+ return status;
1475
+ }
1476
+
1477
+ warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
1478
+
1479
+ // GPUs of compute capability 2.x and higher allocate registers to warps
1480
+ //
1481
+ // Number of regs per warp is regs per thread x warp size, rounded up to
1482
+ // register allocation granularity
1483
+ //
1484
+ regsPerWarp = attributes->numRegs * properties->warpSize;
1485
+ regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
1486
+ regsAllocatedPerCTA = regsAllocatedPerWarp * warpsAllocatedPerCTA;
1487
+
1488
+ // Hardware verifies if a launch fits the per-CTA register limit. For
1489
+ // historical reasons, the verification logic assumes register
1490
+ // allocations are made to all partitions simultaneously. Therefore, to
1491
+ // simulate the hardware check, the warp allocation needs to be rounded
1492
+ // up to the number of partitions.
1493
+ //
1494
+ regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);
1495
+
1496
+ if (properties->regsPerBlock < regsAssumedPerCTA || // Hardware check
1497
+ properties->regsPerBlock < regsAllocatedPerCTA || // Software check
1498
+ attributes->numRegs > maxRegsPerThread) { // Per thread limit check
1499
+ maxBlocks = 0;
1500
+ }
1501
+ else {
1502
+ if (regsAllocatedPerWarp > 0) {
1503
+ // Registers are allocated in each sub-partition. The max number
1504
+ // of warps that can fit on an SM is equal to the max number of
1505
+ // warps per sub-partition x number of sub-partitions.
1506
+ //
1507
+ numRegsPerSubPartition = properties->regsPerMultiprocessor / numSubPartitions;
1508
+ numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;
1509
+
1510
+ maxBlocks = 0;
1511
+
1512
+ if (*gcConfig != PARTITIONED_GC_OFF) {
1513
+ int numSubPartitionsPerSmPartition;
1514
+ int numWarpsPerSmPartition;
1515
+ int maxBlocksPerSmPartition;
1516
+
1517
+ // If partitioned global caching is on, then a CTA can only
1518
+ // use a half SM, and thus a half of the registers available
1519
+ // per SM
1520
+ //
1521
+ numSubPartitionsPerSmPartition = numSubPartitions / 2;
1522
+ numWarpsPerSmPartition = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
1523
+ maxBlocksPerSmPartition = numWarpsPerSmPartition / warpsAllocatedPerCTA;
1524
+ maxBlocks = maxBlocksPerSmPartition * 2;
1525
+ }
1526
+
1527
+ // Try again if partitioned global caching is not enabled, or if
1528
+ // the CTA cannot fit on the SM with caching on (maxBlocks == 0). In the latter
1529
+ // case, the device will automatically turn off caching, except
1530
+ // if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
1531
+ // occupancy and launch configuration.
1532
+ //
1533
+ if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
1534
+ // In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
1535
+ // this is what it will be if we spread CTA across partitions.
1536
+ //
1537
+ *gcConfig = PARTITIONED_GC_OFF;
1538
+ numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
1539
+ maxBlocks = numWarpsPerSM / warpsAllocatedPerCTA;
1540
+ }
1541
+ }
1542
+ else {
1543
+ maxBlocks = INT_MAX;
1544
+ }
1545
+ }
1546
+
1547
+
1548
+ result->allocatedRegistersPerBlock = regsAllocatedPerCTA;
1549
+
1550
+ *limit = maxBlocks;
1551
+
1552
+ return status;
1553
+ }
1554
+
1555
+ // Barrier limit
1556
+ //
1557
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
1558
+ int *limit,
1559
+ int ctaLimitBlocks,
1560
+ const cudaOccDeviceProp *properties,
1561
+ const cudaOccFuncAttributes *attributes)
1562
+ {
1563
+ cudaOccError status = CUDA_OCC_SUCCESS;
1564
+ int numBarriersAvailable = 0;
1565
+ int numBarriersUsed = attributes->numBlockBarriers;
1566
+ int maxBlocks = INT_MAX;
1567
+
1568
+ switch(properties->computeMajor) {
1569
+ case 5:
1570
+ case 6:
1571
+ case 7:
1572
+ numBarriersAvailable = ctaLimitBlocks * 2;
1573
+ break;
1574
+ case 8:
1575
+ if (properties->computeMinor == 0) {
1576
+ numBarriersAvailable = ctaLimitBlocks * 2;
1577
+ }
1578
+ else {
1579
+ numBarriersAvailable = ctaLimitBlocks;
1580
+ }
1581
+ break;
1582
+ case 9:
1583
+ numBarriersAvailable = ctaLimitBlocks * 2;
1584
+ break;
1585
+ case 10:
1586
+ switch(properties->computeMinor) {
1587
+ case 1 :
1588
+ numBarriersAvailable = ctaLimitBlocks;
1589
+ break;
1590
+ case 0 : /* explicitly added to avoid build failure in WDDM driver components. */
1591
+ default :
1592
+ numBarriersAvailable = ctaLimitBlocks * 2;
1593
+ }
1594
+
1595
+ break;
1596
+ case 12:
1597
+ numBarriersAvailable = ctaLimitBlocks;
1598
+ break;
1599
+ default:
1600
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
1601
+ }
1602
+
1603
+ if (numBarriersUsed) {
1604
+ maxBlocks = numBarriersAvailable / numBarriersUsed;
1605
+ }
1606
+
1607
+ *limit = maxBlocks;
1608
+
1609
+ return status;
1610
+ }
1611
+
1612
+ ///////////////////////////////////
1613
+ // API Implementations //
1614
+ ///////////////////////////////////
1615
+
1616
+ static __OCC_INLINE
1617
+ cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
1618
+ cudaOccResult *result,
1619
+ const cudaOccDeviceProp *properties,
1620
+ const cudaOccFuncAttributes *attributes,
1621
+ const cudaOccDeviceState *state,
1622
+ int blockSize,
1623
+ size_t dynamicSmemSize)
1624
+ {
1625
+ cudaOccError status = CUDA_OCC_SUCCESS;
1626
+ int ctaLimitWarps = 0;
1627
+ int ctaLimitBlocks = 0;
1628
+ int ctaLimitSMem = 0;
1629
+ int ctaLimitRegs = 0;
1630
+ int ctaLimitBars = 0;
1631
+ int ctaLimit = 0;
1632
+ unsigned int limitingFactors = 0;
1633
+
1634
+ cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;
1635
+
1636
+ if (!result || !properties || !attributes || !state || blockSize <= 0) {
1637
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1638
+ }
1639
+
1640
+ ///////////////////////////
1641
+ // Check user input
1642
+ ///////////////////////////
1643
+
1644
+ status = cudaOccInputCheck(properties, attributes, state);
1645
+ if (status != CUDA_OCC_SUCCESS) {
1646
+ return status;
1647
+ }
1648
+
1649
+ ///////////////////////////
1650
+ // Initialization
1651
+ ///////////////////////////
1652
+
1653
+ gcConfig = cudaOccPartitionedGCExpected(properties, attributes);
1654
+
1655
+ ///////////////////////////
1656
+ // Compute occupancy
1657
+ ///////////////////////////
1658
+
1659
+ // Limits due to registers/SM
1660
+ // Also compute if partitioned global caching has to be turned off
1661
+ //
1662
+ status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
1663
+ if (status != CUDA_OCC_SUCCESS) {
1664
+ return status;
1665
+ }
1666
+
1667
+ // SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
1668
+ // As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
1669
+ // For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0),
1670
+ // we do not let it run on any Pascal processor, even though it may be able to run on GP100.
1671
+ // Therefore, we check the occupancy on GP10x when it can run on GP100
1672
+ //
1673
+ if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
1674
+ cudaOccDeviceProp propertiesGP10x;
1675
+ cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
1676
+ int ctaLimitRegsGP10x = 0;
1677
+
1678
+ // Set up properties for GP10x
1679
+ memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
1680
+ propertiesGP10x.computeMinor = 1;
1681
+
1682
+ status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
1683
+ if (status != CUDA_OCC_SUCCESS) {
1684
+ return status;
1685
+ }
1686
+
1687
+ if (ctaLimitRegsGP10x == 0) {
1688
+ ctaLimitRegs = 0;
1689
+ }
1690
+ }
1691
+
1692
+ // Limits due to warps/SM
1693
+ //
1694
+ status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
1695
+ if (status != CUDA_OCC_SUCCESS) {
1696
+ return status;
1697
+ }
1698
+
1699
+ // Limits due to blocks/SM
1700
+ //
1701
+ status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
1702
+ if (status != CUDA_OCC_SUCCESS) {
1703
+ return status;
1704
+ }
1705
+
1706
+ // Limits due to shared memory/SM
1707
+ //
1708
+ status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
1709
+ if (status != CUDA_OCC_SUCCESS) {
1710
+ return status;
1711
+ }
1712
+
1713
+ ///////////////////////////
1714
+ // Overall occupancy
1715
+ ///////////////////////////
1716
+
1717
+ // Overall limit is min() of limits due to above reasons
1718
+ //
1719
+ ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));
1720
+
1721
+ // Determine occupancy limiting factors
1722
+ //
1723
+ if (ctaLimit == ctaLimitWarps) {
1724
+ limitingFactors |= OCC_LIMIT_WARPS;
1725
+ }
1726
+ if (ctaLimit == ctaLimitRegs) {
1727
+ limitingFactors |= OCC_LIMIT_REGISTERS;
1728
+ }
1729
+ if (ctaLimit == ctaLimitSMem) {
1730
+ limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
1731
+ }
1732
+ if (ctaLimit == ctaLimitBlocks) {
1733
+ limitingFactors |= OCC_LIMIT_BLOCKS;
1734
+ }
1735
+
1736
+ // For Hopper onwards compute the limits to occupancy based on block barrier count
1737
+ //
1738
+ if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
1739
+ // Limits due to barrier/SM
1740
+ //
1741
+ status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, properties, attributes);
1742
+ if (status != CUDA_OCC_SUCCESS) {
1743
+ return status;
1744
+ }
1745
+
1746
+ // Recompute overall limit based on barrier/SM
1747
+ //
1748
+ ctaLimit = __occMin(ctaLimitBars, ctaLimit);
1749
+
1750
+ // Determine if this is occupancy limiting factor
1751
+ //
1752
+ if (ctaLimit == ctaLimitBars) {
1753
+ limitingFactors |= OCC_LIMIT_BARRIERS;
1754
+ }
1755
+ }
1756
+ else {
1757
+ ctaLimitBars = INT_MAX;
1758
+ }
1759
+
1760
+ // Fill in the return values
1761
+ //
1762
+ result->limitingFactors = limitingFactors;
1763
+
1764
+ result->blockLimitRegs = ctaLimitRegs;
1765
+ result->blockLimitSharedMem = ctaLimitSMem;
1766
+ result->blockLimitWarps = ctaLimitWarps;
1767
+ result->blockLimitBlocks = ctaLimitBlocks;
1768
+ result->blockLimitBarriers = ctaLimitBars;
1769
+ result->partitionedGCConfig = gcConfig;
1770
+
1771
+ // Final occupancy
1772
+ result->activeBlocksPerMultiprocessor = ctaLimit;
1773
+
1774
+ return CUDA_OCC_SUCCESS;
1775
+ }
1776
+
1777
+ static __OCC_INLINE
1778
+ cudaOccError cudaOccAvailableDynamicSMemPerBlock(
1779
+ size_t *bytesAvailable,
1780
+ const cudaOccDeviceProp *properties,
1781
+ const cudaOccFuncAttributes *attributes,
1782
+ const cudaOccDeviceState *state,
1783
+ int numBlocks,
1784
+ int blockSize)
1785
+ {
1786
+ int allocationGranularity;
1787
+ size_t smemLimitPerBlock;
1788
+ size_t smemAvailableForDynamic;
1789
+ size_t userSmemPreference = 0;
1790
+ size_t sharedMemPerMultiprocessor;
1791
+ cudaOccResult result;
1792
+ cudaOccError status = CUDA_OCC_SUCCESS;
1793
+
1794
+ if (numBlocks <= 0)
1795
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1796
+
1797
+ // First compute occupancy of potential kernel launch.
1798
+ //
1799
+ status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
1800
+ if (status != CUDA_OCC_SUCCESS) {
1801
+ return status;
1802
+ }
1803
+ // Check if occupancy is achievable given user requested number of blocks.
1804
+ //
1805
+ if (result.activeBlocksPerMultiprocessor < numBlocks) {
1806
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1807
+ }
1808
+
1809
+ status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
1810
+ if (status != CUDA_OCC_SUCCESS) {
1811
+ return status;
1812
+ }
1813
+
1814
+ // Return the per block shared memory limit based on function config.
1815
+ //
1816
+ status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
1817
+ if (status != CUDA_OCC_SUCCESS) {
1818
+ return status;
1819
+ }
1820
+
1821
+ // If there is only a single block needed per SM, then the user preference can be ignored and the fully SW
1822
+ // limit is allowed to be used as shared memory otherwise if more than one block is needed, then the user
1823
+ // preference sets the total limit of available shared memory.
1824
+ //
1825
+ cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
1826
+ if (numBlocks == 1) {
1827
+ sharedMemPerMultiprocessor = smemLimitPerBlock;
1828
+ }
1829
+ else {
1830
+ if (!userSmemPreference) {
1831
+ userSmemPreference = 1 ;
1832
+ status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
1833
+ if (status != CUDA_OCC_SUCCESS) {
1834
+ return status;
1835
+ }
1836
+ }
1837
+ sharedMemPerMultiprocessor = userSmemPreference;
1838
+ }
1839
+
1840
+ // Compute total shared memory available per SM
1841
+ //
1842
+ smemAvailableForDynamic = sharedMemPerMultiprocessor / numBlocks;
1843
+ smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;
1844
+
1845
+ // Cap shared memory
1846
+ //
1847
+ if (smemAvailableForDynamic > smemLimitPerBlock) {
1848
+ smemAvailableForDynamic = smemLimitPerBlock;
1849
+ }
1850
+
1851
+ // Now compute dynamic shared memory size
1852
+ smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes;
1853
+
1854
+ // Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute()
1855
+ //
1856
+ if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
1857
+ smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;
1858
+
1859
+ *bytesAvailable = smemAvailableForDynamic;
1860
+ return CUDA_OCC_SUCCESS;
1861
+ }
1862
+
1863
+ static __OCC_INLINE
1864
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
1865
+ int *minGridSize,
1866
+ int *blockSize,
1867
+ const cudaOccDeviceProp *properties,
1868
+ const cudaOccFuncAttributes *attributes,
1869
+ const cudaOccDeviceState *state,
1870
+ size_t (*blockSizeToDynamicSMemSize)(int),
1871
+ size_t dynamicSMemSize)
1872
+ {
1873
+ cudaOccError status = CUDA_OCC_SUCCESS;
1874
+ cudaOccResult result;
1875
+
1876
+ // Limits
1877
+ int occupancyLimit;
1878
+ int granularity;
1879
+ int blockSizeLimit;
1880
+
1881
+ // Recorded maximum
1882
+ int maxBlockSize = 0;
1883
+ int numBlocks = 0;
1884
+ int maxOccupancy = 0;
1885
+
1886
+ // Temporary
1887
+ int blockSizeToTryAligned;
1888
+ int blockSizeToTry;
1889
+ int blockSizeLimitAligned;
1890
+ int occupancyInBlocks;
1891
+ int occupancyInThreads;
1892
+
1893
+ ///////////////////////////
1894
+ // Check user input
1895
+ ///////////////////////////
1896
+
1897
+ if (!minGridSize || !blockSize || !properties || !attributes || !state) {
1898
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1899
+ }
1900
+
1901
+ status = cudaOccInputCheck(properties, attributes, state);
1902
+ if (status != CUDA_OCC_SUCCESS) {
1903
+ return status;
1904
+ }
1905
+
1906
+ /////////////////////////////////////////////////////////////////////////////////
1907
+ // Try each block size, and pick the block size with maximum occupancy
1908
+ /////////////////////////////////////////////////////////////////////////////////
1909
+
1910
+ occupancyLimit = properties->maxThreadsPerMultiprocessor;
1911
+ granularity = properties->warpSize;
1912
+
1913
+ blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
1914
+ blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
1915
+
1916
+ for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
1917
+ blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
1918
+
1919
+ // Ignore dynamicSMemSize if the user provides a mapping
1920
+ //
1921
+ if (blockSizeToDynamicSMemSize) {
1922
+ dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry);
1923
+ }
1924
+
1925
+ status = cudaOccMaxActiveBlocksPerMultiprocessor(
1926
+ &result,
1927
+ properties,
1928
+ attributes,
1929
+ state,
1930
+ blockSizeToTry,
1931
+ dynamicSMemSize);
1932
+
1933
+ if (status != CUDA_OCC_SUCCESS) {
1934
+ return status;
1935
+ }
1936
+
1937
+ occupancyInBlocks = result.activeBlocksPerMultiprocessor;
1938
+ occupancyInThreads = blockSizeToTry * occupancyInBlocks;
1939
+
1940
+ if (occupancyInThreads > maxOccupancy) {
1941
+ maxBlockSize = blockSizeToTry;
1942
+ numBlocks = occupancyInBlocks;
1943
+ maxOccupancy = occupancyInThreads;
1944
+ }
1945
+
1946
+ // Early out if we have reached the maximum
1947
+ //
1948
+ if (occupancyLimit == maxOccupancy) {
1949
+ break;
1950
+ }
1951
+ }
1952
+
1953
+ ///////////////////////////
1954
+ // Return best available
1955
+ ///////////////////////////
1956
+
1957
+ // Suggested min grid size to achieve a full machine launch
1958
+ //
1959
+ *minGridSize = numBlocks * properties->numSms;
1960
+ *blockSize = maxBlockSize;
1961
+
1962
+ return status;
1963
+ }
1964
+
1965
+
1966
+ #if defined(__cplusplus)
1967
+
1968
+ namespace {
1969
+
1970
+ __OCC_INLINE
1971
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
1972
+ int *minGridSize,
1973
+ int *blockSize,
1974
+ const cudaOccDeviceProp *properties,
1975
+ const cudaOccFuncAttributes *attributes,
1976
+ const cudaOccDeviceState *state,
1977
+ size_t dynamicSMemSize)
1978
+ {
1979
+ return cudaOccMaxPotentialOccupancyBlockSize(
1980
+ minGridSize,
1981
+ blockSize,
1982
+ properties,
1983
+ attributes,
1984
+ state,
1985
+ NULL,
1986
+ dynamicSMemSize);
1987
+ }
1988
+
1989
+ template <typename UnaryFunction>
1990
+ __OCC_INLINE
1991
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
1992
+ int *minGridSize,
1993
+ int *blockSize,
1994
+ const cudaOccDeviceProp *properties,
1995
+ const cudaOccFuncAttributes *attributes,
1996
+ const cudaOccDeviceState *state,
1997
+ UnaryFunction blockSizeToDynamicSMemSize)
1998
+ {
1999
+ cudaOccError status = CUDA_OCC_SUCCESS;
2000
+ cudaOccResult result;
2001
+
2002
+ // Limits
2003
+ int occupancyLimit;
2004
+ int granularity;
2005
+ int blockSizeLimit;
2006
+
2007
+ // Recorded maximum
2008
+ int maxBlockSize = 0;
2009
+ int numBlocks = 0;
2010
+ int maxOccupancy = 0;
2011
+
2012
+ // Temporary
2013
+ int blockSizeToTryAligned;
2014
+ int blockSizeToTry;
2015
+ int blockSizeLimitAligned;
2016
+ int occupancyInBlocks;
2017
+ int occupancyInThreads;
2018
+ size_t dynamicSMemSize;
2019
+
2020
+ ///////////////////////////
2021
+ // Check user input
2022
+ ///////////////////////////
2023
+
2024
+ if (!minGridSize || !blockSize || !properties || !attributes || !state) {
2025
+ return CUDA_OCC_ERROR_INVALID_INPUT;
2026
+ }
2027
+
2028
+ status = cudaOccInputCheck(properties, attributes, state);
2029
+ if (status != CUDA_OCC_SUCCESS) {
2030
+ return status;
2031
+ }
2032
+
2033
+ /////////////////////////////////////////////////////////////////////////////////
2034
+ // Try each block size, and pick the block size with maximum occupancy
2035
+ /////////////////////////////////////////////////////////////////////////////////
2036
+
2037
+ occupancyLimit = properties->maxThreadsPerMultiprocessor;
2038
+ granularity = properties->warpSize;
2039
+ blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
2040
+ blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
2041
+
2042
+ for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
2043
+ blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
2044
+
2045
+ dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
2046
+
2047
+ status = cudaOccMaxActiveBlocksPerMultiprocessor(
2048
+ &result,
2049
+ properties,
2050
+ attributes,
2051
+ state,
2052
+ blockSizeToTry,
2053
+ dynamicSMemSize);
2054
+
2055
+ if (status != CUDA_OCC_SUCCESS) {
2056
+ return status;
2057
+ }
2058
+
2059
+ occupancyInBlocks = result.activeBlocksPerMultiprocessor;
2060
+
2061
+ occupancyInThreads = blockSizeToTry * occupancyInBlocks;
2062
+
2063
+ if (occupancyInThreads > maxOccupancy) {
2064
+ maxBlockSize = blockSizeToTry;
2065
+ numBlocks = occupancyInBlocks;
2066
+ maxOccupancy = occupancyInThreads;
2067
+ }
2068
+
2069
+ // Early out if we have reached the maximum
2070
+ //
2071
+ if (occupancyLimit == maxOccupancy) {
2072
+ break;
2073
+ }
2074
+ }
2075
+
2076
+ ///////////////////////////
2077
+ // Return best available
2078
+ ///////////////////////////
2079
+
2080
+ // Suggested min grid size to achieve a full machine launch
2081
+ //
2082
+ *minGridSize = numBlocks * properties->numSms;
2083
+ *blockSize = maxBlockSize;
2084
+
2085
+ return status;
2086
+ }
2087
+
2088
+ } // namespace anonymous
2089
+
2090
+ #endif /*__cplusplus */
2091
+
2092
+ #undef __OCC_INLINE
2093
+
2094
+ #endif /*__cuda_occupancy_h__*/
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cuda_pipeline_primitives.h ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_PIPELINE_PRIMITIVES_H_
51
+ # define _CUDA_PIPELINE_PRIMITIVES_H_
52
+
53
+ # include "cuda_pipeline_helpers.h"
54
+
55
+ _CUDA_PIPELINE_STATIC_QUALIFIER
56
+ void __pipeline_memcpy_async(void* __restrict__ dst_shared, const void* __restrict__ src_global, size_t size_and_align,
57
+ size_t zfill = 0)
58
+ {
59
+ _CUDA_PIPELINE_ASSERT(size_and_align == 4 || size_and_align == 8 || size_and_align == 16);
60
+ _CUDA_PIPELINE_ASSERT(zfill <= size_and_align);
61
+ _CUDA_PIPELINE_ASSERT(__isShared(dst_shared));
62
+ _CUDA_PIPELINE_ASSERT(__isGlobal(src_global));
63
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst_shared) & (size_and_align - 1)));
64
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src_global) & (size_and_align - 1)));
65
+
66
+ switch (size_and_align) {
67
+ case 16:
68
+ switch (zfill) {
69
+ case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 16>(dst_shared, src_global); return;
70
+ case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 15>(dst_shared, src_global); return;
71
+ case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 14>(dst_shared, src_global); return;
72
+ case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 13>(dst_shared, src_global); return;
73
+ case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 12>(dst_shared, src_global); return;
74
+ case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 11>(dst_shared, src_global); return;
75
+ case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 10>(dst_shared, src_global); return;
76
+ case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 9>(dst_shared, src_global); return;
77
+ case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 8>(dst_shared, src_global); return;
78
+ case 9: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 7>(dst_shared, src_global); return;
79
+ case 10: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 6>(dst_shared, src_global); return;
80
+ case 11: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 5>(dst_shared, src_global); return;
81
+ case 12: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 4>(dst_shared, src_global); return;
82
+ case 13: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 3>(dst_shared, src_global); return;
83
+ case 14: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 2>(dst_shared, src_global); return;
84
+ case 15: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 1>(dst_shared, src_global); return;
85
+ case 16: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async<16, 0>(dst_shared, src_global); return;
86
+ default: _CUDA_PIPELINE_ABORT(); return;
87
+ }
88
+ case 8:
89
+ switch (zfill) {
90
+ case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 8>(dst_shared, src_global); return;
91
+ case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 7>(dst_shared, src_global); return;
92
+ case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 6>(dst_shared, src_global); return;
93
+ case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 5>(dst_shared, src_global); return;
94
+ case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 4>(dst_shared, src_global); return;
95
+ case 5: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 3>(dst_shared, src_global); return;
96
+ case 6: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 2>(dst_shared, src_global); return;
97
+ case 7: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 1>(dst_shared, src_global); return;
98
+ case 8: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 8, 0>(dst_shared, src_global); return;
99
+ default: _CUDA_PIPELINE_ABORT(); return;
100
+ }
101
+ case 4:
102
+ switch (zfill) {
103
+ case 0: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 4>(dst_shared, src_global); return;
104
+ case 1: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 3>(dst_shared, src_global); return;
105
+ case 2: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 2>(dst_shared, src_global); return;
106
+ case 3: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 1>(dst_shared, src_global); return;
107
+ case 4: _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_memcpy_async< 4, 0>(dst_shared, src_global); return;
108
+ default: _CUDA_PIPELINE_ABORT(); return;
109
+ }
110
+ default:
111
+ _CUDA_PIPELINE_ABORT();
112
+ return;
113
+ }
114
+ }
115
+
116
+ _CUDA_PIPELINE_STATIC_QUALIFIER
117
+ void __pipeline_commit()
118
+ {
119
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
120
+ }
121
+
122
+ _CUDA_PIPELINE_STATIC_QUALIFIER
123
+ void __pipeline_wait_prior(size_t prior)
124
+ {
125
+ switch (prior) {
126
+ case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); return;
127
+ case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); return;
128
+ case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); return;
129
+ case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); return;
130
+ case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); return;
131
+ case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); return;
132
+ case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); return;
133
+ case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); return;
134
+ default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); return;
135
+ }
136
+ }
137
+
138
+ # if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
139
+ # include "cuda_awbarrier_primitives.h"
140
+
141
+ _CUDA_PIPELINE_STATIC_QUALIFIER
142
+ void __pipeline_arrive_on(__mbarrier_t* barrier)
143
+ {
144
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(barrier);
145
+ }
146
+ # endif
147
+
148
+ #endif /* !_CUDA_PIPELINE_PRIMITIVES_H_ */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti.h ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(_CUPTI_H_)
51
+ #define _CUPTI_H_
52
+
53
+ #ifdef _WIN32
54
+ #ifndef WIN32_LEAN_AND_MEAN
55
+ #define WIN32_LEAN_AND_MEAN
56
+ #endif
57
+ #ifdef NOMINMAX
58
+ #include <windows.h>
59
+ #else
60
+ #define NOMINMAX
61
+ #include <windows.h>
62
+ #undef NOMINMAX
63
+ #endif
64
+ #endif
65
+
66
+ #include <cuda.h>
67
+ #include <cupti_result.h>
68
+ #include <cupti_version.h>
69
+
70
+ /* Activity, callback, event and metric APIs */
71
+ #include <cupti_activity.h>
72
+ #include <cupti_callbacks.h>
73
+ #include <cupti_events.h>
74
+ #include <cupti_metrics.h>
75
+
76
+ /* Runtime, driver, and nvtx function identifiers */
77
+ #include <cupti_driver_cbid.h>
78
+ #include <cupti_runtime_cbid.h>
79
+ #include <cupti_nvtx_cbid.h>
80
+
81
+ /* To support function parameter structures for obsoleted API. See
82
+ cuda.h for the actual definition of these structures. */
83
+ typedef unsigned int CUdeviceptr_v1;
84
+ typedef struct CUDA_MEMCPY2D_v1_st { int dummy; } CUDA_MEMCPY2D_v1;
85
+ typedef struct CUDA_MEMCPY3D_v1_st { int dummy; } CUDA_MEMCPY3D_v1;
86
+ typedef struct CUDA_ARRAY_DESCRIPTOR_v1_st { int dummy; } CUDA_ARRAY_DESCRIPTOR_v1;
87
+ typedef struct CUDA_ARRAY3D_DESCRIPTOR_v1_st { int dummy; } CUDA_ARRAY3D_DESCRIPTOR_v1;
88
+
89
+ /* Function parameter structures */
90
+ #include <generated_cuda_runtime_api_meta.h>
91
+ #include <generated_cuda_meta.h>
92
+
93
+ /* The following parameter structures cannot be included unless a
94
+ header that defines GL_VERSION is included before including them.
95
+ If these are needed then make sure such a header is included
96
+ already. */
97
+ #ifdef GL_VERSION
98
+ #include <generated_cuda_gl_interop_meta.h>
99
+ #include <generated_cudaGL_meta.h>
100
+ #endif
101
+
102
+ //#include <generated_nvtx_meta.h>
103
+
104
+ /* The following parameter structures cannot be included by default as
105
+ they are not guaranteed to be available on all systems. Uncomment
106
+ the includes that are available, or use the include explicitly. */
107
+ #if defined(__linux__)
108
+ //#include <generated_cuda_vdpau_interop_meta.h>
109
+ //#include <generated_cudaVDPAU_meta.h>
110
+ #endif
111
+
112
+ #ifdef _WIN32
113
+ //#include <generated_cuda_d3d9_interop_meta.h>
114
+ //#include <generated_cuda_d3d10_interop_meta.h>
115
+ //#include <generated_cuda_d3d11_interop_meta.h>
116
+ //#include <generated_cudaD3D9_meta.h>
117
+ //#include <generated_cudaD3D10_meta.h>
118
+ //#include <generated_cudaD3D11_meta.h>
119
+ #endif
120
+
121
+ #endif /*_CUPTI_H_*/
122
+
123
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_events.h ADDED
@@ -0,0 +1,1349 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2010-2024 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(_CUPTI_EVENTS_H_)
51
+ #define _CUPTI_EVENTS_H_
52
+
53
+ #include <cuda.h>
54
+ #include <string.h>
55
+ #include <cuda_stdint.h>
56
+ #include <cupti_result.h>
57
+
58
+ #ifndef CUPTIAPI
59
+ #ifdef _WIN32
60
+ #define CUPTIAPI __stdcall
61
+ #else
62
+ #define CUPTIAPI
63
+ #endif
64
+ #endif
65
+
66
+ #if defined(__cplusplus)
67
+ extern "C" {
68
+ #endif
69
+
70
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
71
+ #pragma GCC visibility push(default)
72
+ #endif
73
+
74
+ /**
75
+ * \defgroup CUPTI_EVENT_API CUPTI Event API
76
+ * Functions, types, and enums that implement the CUPTI Event API.
77
+ *
78
+ * \note The CUPTI event API from the header cupti_events.h is not supported on devices
79
+ * with compute capability 7.5 and higher (i.e. Turing and later GPU architectures).
80
+ * This API is deprecated in CUDA 12.8 release and will be removed in a future CUDA release.
81
+ * This is replaced by the host profiling API in the header cupti_profiler_host.h and
82
+ * target profiling API in the header cupti_range_profiler.h which are supported on
83
+ * devices with compute capability 7.0 and higher (i.e. Volta and later GPU architectures).
84
+ *
85
+ * @{
86
+ */
87
+
88
+ /**
89
+ * \brief ID for an event.
90
+ *
91
+ * An event represents a countable activity, action, or occurrence on
92
+ * the device.
93
+ */
94
+ typedef uint32_t CUpti_EventID;
95
+
96
+ /**
97
+ * \brief ID for an event domain.
98
+ *
99
+ * ID for an event domain. An event domain represents a group of
100
+ * related events. A device may have multiple instances of a domain,
101
+ * indicating that the device can simultaneously record multiple
102
+ * instances of each event within that domain.
103
+ */
104
+ typedef uint32_t CUpti_EventDomainID;
105
+
106
+ /**
107
+ * \brief A group of events.
108
+ *
109
+ * An event group is a collection of events that are managed
110
+ * together. All events in an event group must belong to the same
111
+ * domain.
112
+ */
113
+ typedef void *CUpti_EventGroup;
114
+
115
+ /**
116
+ * \brief Device class.
117
+ *
118
+ * Enumeration of device classes for device attribute
119
+ * CUPTI_DEVICE_ATTR_DEVICE_CLASS.
120
+ */
121
+ typedef enum {
122
+ CUPTI_DEVICE_ATTR_DEVICE_CLASS_TESLA = 0,
123
+ CUPTI_DEVICE_ATTR_DEVICE_CLASS_QUADRO = 1,
124
+ CUPTI_DEVICE_ATTR_DEVICE_CLASS_GEFORCE = 2,
125
+ CUPTI_DEVICE_ATTR_DEVICE_CLASS_TEGRA = 3,
126
+ } CUpti_DeviceAttributeDeviceClass;
127
+
128
+ /**
129
+ * \brief Device attributes.
130
+ *
131
+ * CUPTI device attributes. These attributes can be read using \ref
132
+ * cuptiDeviceGetAttribute.
133
+ */
134
+ typedef enum {
135
+ /**
136
+ * Number of event IDs for a device. Value is a uint32_t.
137
+ */
138
+ CUPTI_DEVICE_ATTR_MAX_EVENT_ID = 1,
139
+ /**
140
+ * Number of event domain IDs for a device. Value is a uint32_t.
141
+ */
142
+ CUPTI_DEVICE_ATTR_MAX_EVENT_DOMAIN_ID = 2,
143
+ /**
144
+ * Get global memory bandwidth in Kbytes/sec. Value is a uint64_t.
145
+ */
146
+ CUPTI_DEVICE_ATTR_GLOBAL_MEMORY_BANDWIDTH = 3,
147
+ /**
148
+ * Get theoretical maximum number of instructions per cycle. Value
149
+ * is a uint32_t.
150
+ */
151
+ CUPTI_DEVICE_ATTR_INSTRUCTION_PER_CYCLE = 4,
152
+ /**
153
+ * Get theoretical maximum number of single precision instructions
154
+ * that can be executed per second. Value is a uint64_t.
155
+ */
156
+ CUPTI_DEVICE_ATTR_INSTRUCTION_THROUGHPUT_SINGLE_PRECISION = 5,
157
+ /**
158
+ * Get number of frame buffers for device. Value is a uint64_t.
159
+ */
160
+ CUPTI_DEVICE_ATTR_MAX_FRAME_BUFFERS = 6,
161
+ /**
162
+ * Get PCIE link rate in Mega bits/sec for device. Return 0 if bus-type
163
+ * is non-PCIE. Value is a uint64_t.
164
+ */
165
+ CUPTI_DEVICE_ATTR_PCIE_LINK_RATE = 7,
166
+ /**
167
+ * Get PCIE link width for device. Return 0 if bus-type
168
+ * is non-PCIE. Value is a uint64_t.
169
+ */
170
+ CUPTI_DEVICE_ATTR_PCIE_LINK_WIDTH = 8,
171
+ /**
172
+ * Get PCIE generation for device. Return 0 if bus-type
173
+ * is non-PCIE. Value is a uint64_t.
174
+ */
175
+ CUPTI_DEVICE_ATTR_PCIE_GEN = 9,
176
+ /**
177
+ * Get the class for the device. Value is a
178
+ * CUpti_DeviceAttributeDeviceClass.
179
+ */
180
+ CUPTI_DEVICE_ATTR_DEVICE_CLASS = 10,
181
+ /**
182
+ * Get the peak single precision flop per cycle. Value is a uint64_t.
183
+ */
184
+ CUPTI_DEVICE_ATTR_FLOP_SP_PER_CYCLE = 11,
185
+ /**
186
+ * Get the peak double precision flop per cycle. Value is a uint64_t.
187
+ */
188
+ CUPTI_DEVICE_ATTR_FLOP_DP_PER_CYCLE = 12,
189
+ /**
190
+ * Get number of L2 units. Value is a uint64_t.
191
+ */
192
+ CUPTI_DEVICE_ATTR_MAX_L2_UNITS = 13,
193
+ /**
194
+ * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_SHARED
195
+ * preference. Value is a uint64_t.
196
+ */
197
+ CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_SHARED = 14,
198
+ /**
199
+ * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_L1
200
+ * preference. Value is a uint64_t.
201
+ */
202
+ CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_L1 = 15,
203
+ /**
204
+ * Get the maximum shared memory for the CU_FUNC_CACHE_PREFER_EQUAL
205
+ * preference. Value is a uint64_t.
206
+ */
207
+ CUPTI_DEVICE_ATTR_MAX_SHARED_MEMORY_CACHE_CONFIG_PREFER_EQUAL = 16,
208
+ /**
209
+ * Get the peak half precision flop per cycle. Value is a uint64_t.
210
+ */
211
+ CUPTI_DEVICE_ATTR_FLOP_HP_PER_CYCLE = 17,
212
+ /**
213
+ * Check if Nvlink is connected to device. Returns 1, if at least one
214
+ * Nvlink is connected to the device, returns 0 otherwise.
215
+ * Value is a uint32_t.
216
+ */
217
+ CUPTI_DEVICE_ATTR_NVLINK_PRESENT = 18,
218
+ /**
219
+ * Check if Nvlink is present between GPU and CPU. Returns Bandwidth,
220
+ * in Bytes/sec, if Nvlink is present, returns 0 otherwise.
221
+ * Value is a uint64_t.
222
+ */
223
+ CUPTI_DEVICE_ATTR_GPU_CPU_NVLINK_BW = 19,
224
+ /**
225
+ * Check if NVSwitch is present in the underlying topology.
226
+ * Returns 1, if present, returns 0 otherwise.
227
+ * Value is a uint32_t.
228
+ */
229
+ CUPTI_DEVICE_ATTR_NVSWITCH_PRESENT = 20,
230
+ CUPTI_DEVICE_ATTR_FORCE_INT = 0x7fffffff,
231
+ } CUpti_DeviceAttribute;
232
+
233
+ /**
234
+ * \brief Event domain attributes.
235
+ *
236
+ * Event domain attributes. Except where noted, all the attributes can
237
+ * be read using either \ref cuptiDeviceGetEventDomainAttribute or
238
+ * \ref cuptiEventDomainGetAttribute.
239
+ */
240
+ typedef enum {
241
+ /**
242
+ * Event domain name. Value is a null terminated const c-string.
243
+ */
244
+ CUPTI_EVENT_DOMAIN_ATTR_NAME = 0,
245
+ /**
246
+ * Number of instances of the domain for which event counts will be
247
+ * collected. The domain may have additional instances that cannot
248
+ * be profiled (see CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT).
249
+ * Can be read only with \ref
250
+ * cuptiDeviceGetEventDomainAttribute. Value is a uint32_t.
251
+ */
252
+ CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT = 1,
253
+ /**
254
+ * Total number of instances of the domain, including instances that
255
+ * cannot be profiled. Use CUPTI_EVENT_DOMAIN_ATTR_INSTANCE_COUNT
256
+ * to get the number of instances that can be profiled. Can be read
257
+ * only with \ref cuptiDeviceGetEventDomainAttribute. Value is a
258
+ * uint32_t.
259
+ */
260
+ CUPTI_EVENT_DOMAIN_ATTR_TOTAL_INSTANCE_COUNT = 3,
261
+ /**
262
+ * Collection method used for events contained in the event domain.
263
+ * Value is a \ref CUpti_EventCollectionMethod.
264
+ */
265
+ CUPTI_EVENT_DOMAIN_ATTR_COLLECTION_METHOD = 4,
266
+
267
+ CUPTI_EVENT_DOMAIN_ATTR_FORCE_INT = 0x7fffffff,
268
+ } CUpti_EventDomainAttribute;
269
+
270
+ /**
271
+ * \brief The collection method used for an event.
272
+ *
273
+ * The collection method indicates how an event is collected.
274
+ */
275
+ typedef enum {
276
+ /**
277
+ * Event is collected using a hardware global performance monitor.
278
+ */
279
+ CUPTI_EVENT_COLLECTION_METHOD_PM = 0,
280
+ /**
281
+ * Event is collected using a hardware SM performance monitor.
282
+ */
283
+ CUPTI_EVENT_COLLECTION_METHOD_SM = 1,
284
+ /**
285
+ * Event is collected using software instrumentation.
286
+ */
287
+ CUPTI_EVENT_COLLECTION_METHOD_INSTRUMENTED = 2,
288
+ /**
289
+ * Event is collected using NvLink throughput counter method.
290
+ */
291
+ CUPTI_EVENT_COLLECTION_METHOD_NVLINK_TC = 3,
292
+ CUPTI_EVENT_COLLECTION_METHOD_FORCE_INT = 0x7fffffff
293
+ } CUpti_EventCollectionMethod;
294
+
295
+ /**
296
+ * \brief Event group attributes.
297
+ *
298
+ * Event group attributes. These attributes can be read using \ref
299
+ * cuptiEventGroupGetAttribute. Attributes marked [rw] can also be
300
+ * written using \ref cuptiEventGroupSetAttribute.
301
+ */
302
+ typedef enum {
303
+ /**
304
+ * The domain to which the event group is bound. This attribute is
305
+ * set when the first event is added to the group. Value is a
306
+ * CUpti_EventDomainID.
307
+ */
308
+ CUPTI_EVENT_GROUP_ATTR_EVENT_DOMAIN_ID = 0,
309
+ /**
310
+ * [rw] Profile all the instances of the domain for this
311
+ * eventgroup. This feature can be used to get load balancing
312
+ * across all instances of a domain. Value is an integer.
313
+ */
314
+ CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES = 1,
315
+ /**
316
+ * [rw] Reserved for user data.
317
+ */
318
+ CUPTI_EVENT_GROUP_ATTR_USER_DATA = 2,
319
+ /**
320
+ * Number of events in the group. Value is a uint32_t.
321
+ */
322
+ CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS = 3,
323
+ /**
324
+ * Enumerates events in the group. Value is a pointer to buffer of
325
+ * size sizeof(CUpti_EventID) * num_of_events in the eventgroup.
326
+ * num_of_events can be queried using
327
+ * CUPTI_EVENT_GROUP_ATTR_NUM_EVENTS.
328
+ */
329
+ CUPTI_EVENT_GROUP_ATTR_EVENTS = 4,
330
+ /**
331
+ * Number of instances of the domain bound to this event group that
332
+ * will be counted. Value is a uint32_t.
333
+ */
334
+ CUPTI_EVENT_GROUP_ATTR_INSTANCE_COUNT = 5,
335
+ /**
336
+ * Event group scope can be set to CUPTI_EVENT_PROFILING_SCOPE_DEVICE or
337
+ * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT for an eventGroup, before
338
+ * adding any event.
339
+ * Sets the scope of eventgroup as CUPTI_EVENT_PROFILING_SCOPE_DEVICE or
340
+ * CUPTI_EVENT_PROFILING_SCOPE_CONTEXT when the scope of the events
341
+ * that will be added is CUPTI_EVENT_PROFILING_SCOPE_BOTH.
342
+ * If profiling scope of event is either
343
+ * CUPTI_EVENT_PROFILING_SCOPE_DEVICE or CUPTI_EVENT_PROFILING_SCOPE_CONTEXT
344
+ * then setting this attribute will not affect the default scope.
345
+ * It is not allowed to add events of different scope to same eventgroup.
346
+ * Value is a uint32_t.
347
+ */
348
+ CUPTI_EVENT_GROUP_ATTR_PROFILING_SCOPE = 6,
349
+ CUPTI_EVENT_GROUP_ATTR_FORCE_INT = 0x7fffffff,
350
+ } CUpti_EventGroupAttribute;
351
+
352
+ /**
353
+ * \brief Profiling scope for event.
354
+ *
355
+ * Profiling scope of event indicates if the event can be collected at context
356
+ * scope or device scope or both i.e. it can be collected at any of context or
357
+ * device scope.
358
+ */
359
+ typedef enum {
360
+ /**
361
+ * Event is collected at context scope.
362
+ */
363
+ CUPTI_EVENT_PROFILING_SCOPE_CONTEXT = 0,
364
+ /**
365
+ * Event is collected at device scope.
366
+ */
367
+ CUPTI_EVENT_PROFILING_SCOPE_DEVICE = 1,
368
+ /**
369
+ * Event can be collected at device or context scope.
370
+ * The scope can be set using \ref cuptiEventGroupSetAttribute API.
371
+ */
372
+ CUPTI_EVENT_PROFILING_SCOPE_BOTH = 2,
373
+ CUPTI_EVENT_PROFILING_SCOPE_FORCE_INT = 0x7fffffff
374
+ } CUpti_EventProfilingScope;
375
+
376
+ /**
377
+ * \brief Event attributes.
378
+ *
379
+ * Event attributes. These attributes can be read using \ref
380
+ * cuptiEventGetAttribute.
381
+ */
382
+ typedef enum {
383
+ /**
384
+ * Event name. Value is a null terminated const c-string.
385
+ */
386
+ CUPTI_EVENT_ATTR_NAME = 0,
387
+ /**
388
+ * Short description of event. Value is a null terminated const
389
+ * c-string.
390
+ */
391
+ CUPTI_EVENT_ATTR_SHORT_DESCRIPTION = 1,
392
+ /**
393
+ * Long description of event. Value is a null terminated const
394
+ * c-string.
395
+ */
396
+ CUPTI_EVENT_ATTR_LONG_DESCRIPTION = 2,
397
+ /**
398
+ * Category of event. Value is CUpti_EventCategory.
399
+ */
400
+ CUPTI_EVENT_ATTR_CATEGORY = 3,
401
+ /**
402
+ * Profiling scope of the events. It can be either device or context or both.
403
+ * Value is a \ref CUpti_EventProfilingScope.
404
+ */
405
+ CUPTI_EVENT_ATTR_PROFILING_SCOPE = 5,
406
+
407
+ CUPTI_EVENT_ATTR_FORCE_INT = 0x7fffffff,
408
+ } CUpti_EventAttribute;
409
+
410
+ /**
411
+ * \brief Event collection modes.
412
+ *
413
+ * The event collection mode determines the period over which the
414
+ * events within the enabled event groups will be collected.
415
+ */
416
+ typedef enum {
417
+ /**
418
+ * Events are collected for the entire duration between the
419
+ * cuptiEventGroupEnable and cuptiEventGroupDisable calls.
420
+ * Event values are reset when the events are read.
421
+ * For CUDA toolkit v6.0 and older this was the default mode.
422
+ */
423
+ CUPTI_EVENT_COLLECTION_MODE_CONTINUOUS = 0,
424
+ /**
425
+ * Events are collected only for the durations of kernel executions
426
+ * that occur between the cuptiEventGroupEnable and
427
+ * cuptiEventGroupDisable calls. Event collection begins when a
428
+ * kernel execution begins, and stops when kernel execution
429
+ * completes. Event values are reset to zero when each kernel
430
+ * execution begins. If multiple kernel executions occur between the
431
+ * cuptiEventGroupEnable and cuptiEventGroupDisable calls then the
432
+ * event values must be read after each kernel launch if those
433
+ * events need to be associated with the specific kernel launch.
434
+ * Note that collection in this mode may significantly change the
435
+ * overall performance characteristics of the application because
436
+ * kernel executions that occur between the cuptiEventGroupEnable and
437
+ * cuptiEventGroupDisable calls are serialized on the GPU.
438
+ * This is the default mode from CUDA toolkit v6.5
439
+ */
440
+ CUPTI_EVENT_COLLECTION_MODE_KERNEL = 1,
441
+ CUPTI_EVENT_COLLECTION_MODE_FORCE_INT = 0x7fffffff
442
+ } CUpti_EventCollectionMode;
443
+
444
+ /**
445
+ * \brief An event category.
446
+ *
447
+ * Each event is assigned to a category that represents the general
448
+ * type of the event. An event's category is accessed using \ref
449
+ * cuptiEventGetAttribute and the CUPTI_EVENT_ATTR_CATEGORY attribute.
450
+ */
451
+ typedef enum {
452
+ /**
453
+ * An instruction related event.
454
+ */
455
+ CUPTI_EVENT_CATEGORY_INSTRUCTION = 0,
456
+ /**
457
+ * A memory related event.
458
+ */
459
+ CUPTI_EVENT_CATEGORY_MEMORY = 1,
460
+ /**
461
+ * A cache related event.
462
+ */
463
+ CUPTI_EVENT_CATEGORY_CACHE = 2,
464
+ /**
465
+ * A profile-trigger event.
466
+ */
467
+ CUPTI_EVENT_CATEGORY_PROFILE_TRIGGER = 3,
468
+ /**
469
+ * A system event.
470
+ */
471
+ CUPTI_EVENT_CATEGORY_SYSTEM = 4,
472
+ CUPTI_EVENT_CATEGORY_FORCE_INT = 0x7fffffff
473
+ } CUpti_EventCategory;
474
+
475
+ /**
476
+ * \brief The overflow value for a CUPTI event.
477
+ *
478
+ * The CUPTI event value that indicates an overflow.
479
+ */
480
+ #define CUPTI_EVENT_OVERFLOW ((uint64_t)0xFFFFFFFFFFFFFFFFULL)
481
+
482
+ /**
483
+ * \brief The value that indicates the event value is invalid
484
+ */
485
+ #define CUPTI_EVENT_INVALID ((uint64_t)0xFFFFFFFFFFFFFFFEULL)
486
+
487
+ /**
488
+ * \brief Flags for cuptiEventGroupReadEvent and
489
+ * cuptiEventGroupReadAllEvents.
490
+ *
491
+ * Flags for \ref cuptiEventGroupReadEvent and \ref
492
+ * cuptiEventGroupReadAllEvents.
493
+ */
494
+ typedef enum {
495
+ /**
496
+ * No flags.
497
+ */
498
+ CUPTI_EVENT_READ_FLAG_NONE = 0,
499
+ CUPTI_EVENT_READ_FLAG_FORCE_INT = 0x7fffffff,
500
+ } CUpti_ReadEventFlags;
501
+
502
+
503
+ /**
504
+ * \brief A set of event groups.
505
+ *
506
+ * A set of event groups. When returned by \ref
507
+ * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets
508
+ * a set indicates the event groups that can be enabled at the same
509
+ * time (i.e. all the events in the set can be collected
510
+ * simultaneously).
511
+ */
512
+ typedef struct {
513
+ /**
514
+ * The number of event groups in the set.
515
+ */
516
+ uint32_t numEventGroups;
517
+ /**
518
+ * An array of \p numEventGroups event groups.
519
+ */
520
+ CUpti_EventGroup *eventGroups;
521
+ } CUpti_EventGroupSet;
522
+
523
+ /**
524
+ * \brief A set of event group sets.
525
+ *
526
+ * A set of event group sets. When returned by \ref
527
+ * cuptiEventGroupSetsCreate and \ref cuptiMetricCreateEventGroupSets
528
+ * a CUpti_EventGroupSets indicates the number of passes required to
529
+ * collect all the events, and the event groups that should be
530
+ * collected during each pass.
531
+ */
532
+ typedef struct {
533
+ /**
534
+ * Number of event group sets.
535
+ */
536
+ uint32_t numSets;
537
+ /**
538
+ * An array of \p numSets event group sets.
539
+ */
540
+ CUpti_EventGroupSet *sets;
541
+ } CUpti_EventGroupSets;
542
+
543
+ /**
544
+ * \brief Set the event collection mode.
545
+ *
546
+ * Set the event collection mode for a \p context. The \p mode
547
+ * controls the event collection behavior of all events in event
548
+ * groups created in the \p context. This API is invalid in kernel
549
+ * replay mode.
550
+ * \note \b Thread-safety: this function is thread safe.
551
+ *
552
+ * \param context The context
553
+ * \param mode The event collection mode
554
+ *
555
+ * \retval CUPTI_SUCCESS
556
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
557
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
558
+ * \retval CUPTI_ERROR_INVALID_OPERATION if called when replay mode is enabled
559
+ * \retval CUPTI_ERROR_NOT_SUPPORTED if mode is not supported on the device
560
+ */
561
+
562
+ CUptiResult CUPTIAPI cuptiSetEventCollectionMode(CUcontext context,
563
+ CUpti_EventCollectionMode mode);
564
+
565
+ /**
566
+ * \brief Read a device attribute.
567
+ *
568
+ * Read a device attribute and return it in \p *value.
569
+ * \note \b Thread-safety: this function is thread safe.
570
+ *
571
+ * \param device The CUDA device
572
+ * \param attrib The attribute to read
573
+ * \param valueSize Size of the buffer pointed to by \p value, and
574
+ * returns the number of bytes written to \p value
575
+ * \param value Returns the value of the attribute
576
+ *
577
+ * \retval CUPTI_SUCCESS
578
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
579
+ * \retval CUPTI_ERROR_INVALID_DEVICE
580
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
581
+ * is NULL, or if \p attrib is not a device attribute
582
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
583
+ * attribute values, indicates that the \p value buffer is too small
584
+ * to hold the attribute value.
585
+ */
586
+ CUptiResult CUPTIAPI cuptiDeviceGetAttribute(CUdevice device,
587
+ CUpti_DeviceAttribute attrib,
588
+ size_t *valueSize,
589
+ void *value);
590
+
591
+ /**
592
+ * \brief Get the number of domains for a device.
593
+ *
594
+ * Returns the number of domains in \p numDomains for a device.
595
+ * \note \b Thread-safety: this function is thread safe.
596
+ *
597
+ * \param device The CUDA device
598
+ * \param numDomains Returns the number of domains
599
+ *
600
+ * \retval CUPTI_SUCCESS
601
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
602
+ * \retval CUPTI_ERROR_INVALID_DEVICE
603
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL
604
+ */
605
+ CUptiResult CUPTIAPI cuptiDeviceGetNumEventDomains(CUdevice device,
606
+ uint32_t *numDomains);
607
+
608
+ /**
609
+ * \brief Get the event domains for a device.
610
+ *
611
+ * Returns the event domains IDs in \p domainArray for a device. The
612
+ * size of the \p domainArray buffer is given by \p
613
+ * *arraySizeBytes. The size of the \p domainArray buffer must be at
614
+ * least \p numDomains * sizeof(CUpti_EventDomainID) or else not all
615
+ * domains will be returned. The value returned in \p
616
+ * *arraySizeBytes contains the number of bytes returned in \p
617
+ * domainArray.
618
+ * \note \b Thread-safety: this function is thread safe.
619
+ *
620
+ * \param device The CUDA device
621
+ * \param arraySizeBytes The size of \p domainArray in bytes, and
622
+ * returns the number of bytes written to \p domainArray
623
+ * \param domainArray Returns the IDs of the event domains for the device
624
+ *
625
+ * \retval CUPTI_SUCCESS
626
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
627
+ * \retval CUPTI_ERROR_INVALID_DEVICE
628
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
629
+ * \p domainArray are NULL
630
+ */
631
+ CUptiResult CUPTIAPI cuptiDeviceEnumEventDomains(CUdevice device,
632
+ size_t *arraySizeBytes,
633
+ CUpti_EventDomainID *domainArray);
634
+
635
+ /**
636
+ * \brief Read an event domain attribute.
637
+ *
638
+ * Returns an event domain attribute in \p *value. The size of the \p
639
+ * value buffer is given by \p *valueSize. The value returned in \p
640
+ * *valueSize contains the number of bytes returned in \p value.
641
+ *
642
+ * If the attribute value is a c-string that is longer than \p
643
+ * *valueSize, then only the first \p *valueSize characters will be
644
+ * returned and there will be no terminating null byte.
645
+ * \note \b Thread-safety: this function is thread safe.
646
+ *
647
+ * \param device The CUDA device
648
+ * \param eventDomain ID of the event domain
649
+ * \param attrib The event domain attribute to read
650
+ * \param valueSize The size of the \p value buffer in bytes, and
651
+ * returns the number of bytes written to \p value
652
+ * \param value Returns the attribute's value
653
+ *
654
+ * \retval CUPTI_SUCCESS
655
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
656
+ * \retval CUPTI_ERROR_INVALID_DEVICE
657
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
658
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
659
+ * is NULL, or if \p attrib is not an event domain attribute
660
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
661
+ * attribute values, indicates that the \p value buffer is too small
662
+ * to hold the attribute value.
663
+ */
664
+ CUptiResult CUPTIAPI cuptiDeviceGetEventDomainAttribute(CUdevice device,
665
+ CUpti_EventDomainID eventDomain,
666
+ CUpti_EventDomainAttribute attrib,
667
+ size_t *valueSize,
668
+ void *value);
669
+
670
+ /**
671
+ * \brief Get the number of event domains available on any device.
672
+ *
673
+ * Returns the total number of event domains available on any
674
+ * CUDA-capable device.
675
+ * \note \b Thread-safety: this function is thread safe.
676
+ *
677
+ * \param numDomains Returns the number of domains
678
+ *
679
+ * \retval CUPTI_SUCCESS
680
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numDomains is NULL
681
+ */
682
+ CUptiResult CUPTIAPI cuptiGetNumEventDomains(uint32_t *numDomains);
683
+
684
+ /**
685
+ * \brief Get the event domains available on any device.
686
+ *
687
+ * Returns all the event domains available on any CUDA-capable device.
688
+ * Event domain IDs are returned in \p domainArray. The size of the \p
689
+ * domainArray buffer is given by \p *arraySizeBytes. The size of the
690
+ * \p domainArray buffer must be at least \p numDomains *
691
+ * sizeof(CUpti_EventDomainID) or else not all domains will be
692
+ * returned. The value returned in \p *arraySizeBytes contains the
693
+ * number of bytes returned in \p domainArray.
694
+ * \note \b Thread-safety: this function is thread safe.
695
+ *
696
+ * \param arraySizeBytes The size of \p domainArray in bytes, and
697
+ * returns the number of bytes written to \p domainArray
698
+ * \param domainArray Returns all the event domains
699
+ *
700
+ * \retval CUPTI_SUCCESS
701
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or
702
+ * \p domainArray are NULL
703
+ */
704
+ CUptiResult CUPTIAPI cuptiEnumEventDomains(size_t *arraySizeBytes,
705
+ CUpti_EventDomainID *domainArray);
706
+
707
+ /**
708
+ * \brief Read an event domain attribute.
709
+ *
710
+ * Returns an event domain attribute in \p *value. The size of the \p
711
+ * value buffer is given by \p *valueSize. The value returned in \p
712
+ * *valueSize contains the number of bytes returned in \p value.
713
+ *
714
+ * If the attribute value is a c-string that is longer than \p
715
+ * *valueSize, then only the first \p *valueSize characters will be
716
+ * returned and there will be no terminating null byte.
717
+ * \note \b Thread-safety: this function is thread safe.
718
+ *
719
+ * \param eventDomain ID of the event domain
720
+ * \param attrib The event domain attribute to read
721
+ * \param valueSize The size of the \p value buffer in bytes, and
722
+ * returns the number of bytes written to \p value
723
+ * \param value Returns the attribute's value
724
+ *
725
+ * \retval CUPTI_SUCCESS
726
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
727
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
728
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
729
+ * is NULL, or if \p attrib is not an event domain attribute
730
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
731
+ * attribute values, indicates that the \p value buffer is too small
732
+ * to hold the attribute value.
733
+ */
734
+ CUptiResult CUPTIAPI cuptiEventDomainGetAttribute(CUpti_EventDomainID eventDomain,
735
+ CUpti_EventDomainAttribute attrib,
736
+ size_t *valueSize,
737
+ void *value);
738
+
739
+ /**
740
+ * \brief Get number of events in a domain.
741
+ *
742
+ * Returns the number of events in \p numEvents for a domain.
743
+ * \note \b Thread-safety: this function is thread safe.
744
+ *
745
+ * \param eventDomain ID of the event domain
746
+ * \param numEvents Returns the number of events in the domain
747
+ *
748
+ * \retval CUPTI_SUCCESS
749
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
750
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
751
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p numEvents is NULL
752
+ */
753
+ CUptiResult CUPTIAPI cuptiEventDomainGetNumEvents(CUpti_EventDomainID eventDomain,
754
+ uint32_t *numEvents);
755
+
756
+ /**
757
+ * \brief Get the events in a domain.
758
+ *
759
+ * Returns the event IDs in \p eventArray for a domain. The size of
760
+ * the \p eventArray buffer is given by \p *arraySizeBytes. The size
761
+ * of the \p eventArray buffer must be at least \p numdomainevents *
762
+ * sizeof(CUpti_EventID) or else not all events will be returned. The
763
+ * value returned in \p *arraySizeBytes contains the number of bytes
764
+ * returned in \p eventArray.
765
+ * \note \b Thread-safety: this function is thread safe.
766
+ *
767
+ * \param eventDomain ID of the event domain
768
+ * \param arraySizeBytes The size of \p eventArray in bytes, and
769
+ * returns the number of bytes written to \p eventArray
770
+ * \param eventArray Returns the IDs of the events in the domain
771
+ *
772
+ * \retval CUPTI_SUCCESS
773
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
774
+ * \retval CUPTI_ERROR_INVALID_EVENT_DOMAIN_ID
775
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p arraySizeBytes or \p
776
+ * eventArray are NULL
777
+ */
778
+ CUptiResult CUPTIAPI cuptiEventDomainEnumEvents(CUpti_EventDomainID eventDomain,
779
+ size_t *arraySizeBytes,
780
+ CUpti_EventID *eventArray);
781
+
782
+ /**
783
+ * \brief Get an event attribute.
784
+ *
785
+ * Returns an event attribute in \p *value. The size of the \p
786
+ * value buffer is given by \p *valueSize. The value returned in \p
787
+ * *valueSize contains the number of bytes returned in \p value.
788
+ *
789
+ * If the attribute value is a c-string that is longer than \p
790
+ * *valueSize, then only the first \p *valueSize characters will be
791
+ * returned and there will be no terminating null byte.
792
+ * \note \b Thread-safety: this function is thread safe.
793
+ *
794
+ * \param event ID of the event
795
+ * \param attrib The event attribute to read
796
+ * \param valueSize The size of the \p value buffer in bytes, and
797
+ * returns the number of bytes written to \p value
798
+ * \param value Returns the attribute's value
799
+ *
800
+ * \retval CUPTI_SUCCESS
801
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
802
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
803
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
804
+ * is NULL, or if \p attrib is not an event attribute
805
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
806
+ * attribute values, indicates that the \p value buffer is too small
807
+ * to hold the attribute value.
808
+ */
809
+ CUptiResult CUPTIAPI cuptiEventGetAttribute(CUpti_EventID event,
810
+ CUpti_EventAttribute attrib,
811
+ size_t *valueSize,
812
+ void *value);
813
+
814
+ /**
815
+ * \brief Find an event by name.
816
+ *
817
+ * Find an event by name and return the event ID in \p *event.
818
+ * \note \b Thread-safety: this function is thread safe.
819
+ *
820
+ * \param device The CUDA device
821
+ * \param eventName The name of the event to find
822
+ * \param event Returns the ID of the found event or undefined if
823
+ * unable to find the event
824
+ *
825
+ * \retval CUPTI_SUCCESS
826
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
827
+ * \retval CUPTI_ERROR_INVALID_DEVICE
828
+ * \retval CUPTI_ERROR_INVALID_EVENT_NAME if unable to find an event
829
+ * with name \p eventName. In this case \p *event is undefined
830
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventName or \p event are NULL
831
+ */
832
+ CUptiResult CUPTIAPI cuptiEventGetIdFromName(CUdevice device,
833
+ const char *eventName,
834
+ CUpti_EventID *event);
835
+
836
+ /**
837
+ * \brief Create a new event group for a context.
838
+ *
839
+ * Creates a new event group for \p context and returns the new group
840
+ * in \p *eventGroup.
841
+ * \note \p flags are reserved for future use and should be set to zero.
842
+ * \note \b Thread-safety: this function is thread safe.
843
+ *
844
+ * \param context The context for the event group
845
+ * \param eventGroup Returns the new event group
846
+ * \param flags Reserved - must be zero
847
+ *
848
+ * \retval CUPTI_SUCCESS
849
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
850
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
851
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY
852
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
853
+ */
854
+ CUptiResult CUPTIAPI cuptiEventGroupCreate(CUcontext context,
855
+ CUpti_EventGroup *eventGroup,
856
+ uint32_t flags);
857
+
858
+ /**
859
+ * \brief Destroy an event group.
860
+ *
861
+ * Destroy an \p eventGroup and free its resources. An event group
862
+ * cannot be destroyed if it is enabled.
863
+ * \note \b Thread-safety: this function is thread safe.
864
+ *
865
+ * \param eventGroup The event group to destroy
866
+ *
867
+ * \retval CUPTI_SUCCESS
868
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
869
+ * \retval CUPTI_ERROR_INVALID_OPERATION if the event group is enabled
870
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
871
+ */
872
+ CUptiResult CUPTIAPI cuptiEventGroupDestroy(CUpti_EventGroup eventGroup);
873
+
874
+ /**
875
+ * \brief Read an event group attribute.
876
+ *
877
+ * Read an event group attribute and return it in \p *value.
878
+ * \note \b Thread-safety: this function is thread safe but client
879
+ * must guard against simultaneous destruction or modification of \p
880
+ * eventGroup (for example, client must guard against simultaneous
881
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
882
+ * etc.), and must guard against simultaneous destruction of the
883
+ * context in which \p eventGroup was created (for example, client
884
+ * must guard against simultaneous calls to cudaDeviceReset,
885
+ * cuCtxDestroy, etc.).
886
+ *
887
+ * \param eventGroup The event group
888
+ * \param attrib The attribute to read
889
+ * \param valueSize Size of the buffer pointed to by \p value, and
890
+ * returns the number of bytes written to \p value
891
+ * \param value Returns the value of the attribute
892
+ *
893
+ * \retval CUPTI_SUCCESS
894
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
895
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
896
+ * is NULL, or if \p attrib is not an event group attribute
897
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT For non-c-string
898
+ * attribute values, indicates that the \p value buffer is too small
899
+ * to hold the attribute value.
900
+ */
901
+ CUptiResult CUPTIAPI cuptiEventGroupGetAttribute(CUpti_EventGroup eventGroup,
902
+ CUpti_EventGroupAttribute attrib,
903
+ size_t *valueSize,
904
+ void *value);
905
+
906
+ /**
907
+ * \brief Write an event group attribute.
908
+ *
909
+ * Write an event group attribute.
910
+ * \note \b Thread-safety: this function is thread safe.
911
+ *
912
+ * \param eventGroup The event group
913
+ * \param attrib The attribute to write
914
+ * \param valueSize The size, in bytes, of the value
915
+ * \param value The attribute value to write
916
+ *
917
+ * \retval CUPTI_SUCCESS
918
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
919
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p valueSize or \p value
920
+ * is NULL, or if \p attrib is not an event group attribute, or if
921
+ * \p attrib is not a writable attribute
922
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT Indicates that
923
+ * the \p value buffer is too small to hold the attribute value.
924
+ */
925
+ CUptiResult CUPTIAPI cuptiEventGroupSetAttribute(CUpti_EventGroup eventGroup,
926
+ CUpti_EventGroupAttribute attrib,
927
+ size_t valueSize,
928
+ void *value);
929
+
930
+ /**
931
+ * \brief Add an event to an event group.
932
+ *
933
+ * Add an event to an event group. The event add can fail for a number of reasons:
934
+ * \li The event group is enabled
935
+ * \li The event does not belong to the same event domain as the
936
+ * events that are already in the event group
937
+ * \li Device limitations on the events that can belong to the same group
938
+ * \li The event group is full
939
+ *
940
+ * \note \b Thread-safety: this function is thread safe.
941
+ *
942
+ * \param eventGroup The event group
943
+ * \param event The event to add to the group
944
+ *
945
+ * \retval CUPTI_SUCCESS
946
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
947
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
948
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY
949
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
950
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p event belongs to a
951
+ * different event domain than the events already in \p eventGroup, or
952
+ * if a device limitation prevents \p event from being collected at
953
+ * the same time as the events already in \p eventGroup
954
+ * \retval CUPTI_ERROR_MAX_LIMIT_REACHED if \p eventGroup is full
955
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
956
+ */
957
+ CUptiResult CUPTIAPI cuptiEventGroupAddEvent(CUpti_EventGroup eventGroup,
958
+ CUpti_EventID event);
959
+
960
+ /**
961
+ * \brief Remove an event from an event group.
962
+ *
963
+ * Remove \p event from an event group. The event cannot be
964
+ * removed if the event group is enabled.
965
+ * \note \b Thread-safety: this function is thread safe.
966
+ *
967
+ * \param eventGroup The event group
968
+ * \param event The event to remove from the group
969
+ *
970
+ * \retval CUPTI_SUCCESS
971
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
972
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
973
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
974
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
975
+ */
976
+ CUptiResult CUPTIAPI cuptiEventGroupRemoveEvent(CUpti_EventGroup eventGroup,
977
+ CUpti_EventID event);
978
+
979
+ /**
980
+ * \brief Remove all events from an event group.
981
+ *
982
+ * Remove all events from an event group. Events cannot be removed if
983
+ * the event group is enabled.
984
+ * \note \b Thread-safety: this function is thread safe.
985
+ *
986
+ * \param eventGroup The event group
987
+ *
988
+ * \retval CUPTI_SUCCESS
989
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
990
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is enabled
991
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
992
+ */
993
+ CUptiResult CUPTIAPI cuptiEventGroupRemoveAllEvents(CUpti_EventGroup eventGroup);
994
+
995
+ /**
996
+ * \brief Zero all the event counts in an event group.
997
+ *
998
+ * Zero all the event counts in an event group.
999
+ * \note \b Thread-safety: this function is thread safe but client
1000
+ * must guard against simultaneous destruction or modification of \p
1001
+ * eventGroup (for example, client must guard against simultaneous
1002
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
1003
+ * etc.), and must guard against simultaneous destruction of the
1004
+ * context in which \p eventGroup was created (for example, client
1005
+ * must guard against simultaneous calls to cudaDeviceReset,
1006
+ * cuCtxDestroy, etc.).
1007
+ *
1008
+ * \param eventGroup The event group
1009
+ *
1010
+ * \retval CUPTI_SUCCESS
1011
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
1012
+ * \retval CUPTI_ERROR_HARDWARE
1013
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
1014
+ */
1015
+ CUptiResult CUPTIAPI cuptiEventGroupResetAllEvents(CUpti_EventGroup eventGroup);
1016
+
1017
+ /**
1018
+ * \brief Enable an event group.
1019
+ *
1020
+ * Enable an event group. Enabling an event group zeros the value of
1021
+ * all the events in the group and then starts collection of those
1022
+ * events.
1023
+ * \note \b Thread-safety: this function is thread safe.
1024
+ *
1025
+ * \param eventGroup The event group
1026
+ *
1027
+ * \retval CUPTI_SUCCESS
1028
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
1029
+ * \retval CUPTI_ERROR_HARDWARE
1030
+ * \retval CUPTI_ERROR_NOT_READY if \p eventGroup does not contain any events
1031
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p eventGroup cannot be
1032
+ * enabled due to other already enabled event groups
1033
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
1034
+ * \retval CUPTI_ERROR_HARDWARE_BUSY if another client is profiling
1035
+ * and hardware is busy
1036
+ */
1037
+ CUptiResult CUPTIAPI cuptiEventGroupEnable(CUpti_EventGroup eventGroup);
1038
+
1039
+ /**
1040
+ * \brief Disable an event group.
1041
+ *
1042
+ * Disable an event group. Disabling an event group stops collection
1043
+ * of events contained in the group.
1044
+ * \note \b Thread-safety: this function is thread safe.
1045
+ *
1046
+ * \param eventGroup The event group
1047
+ *
1048
+ * \retval CUPTI_SUCCESS
1049
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
1050
+ * \retval CUPTI_ERROR_HARDWARE
1051
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup is NULL
1052
+ */
1053
+ CUptiResult CUPTIAPI cuptiEventGroupDisable(CUpti_EventGroup eventGroup);
1054
+
1055
+ /**
1056
+ * \brief Read the value for an event in an event group.
1057
+ *
1058
+ * Read the value for an event in an event group. The event value is
1059
+ * returned in the \p eventValueBuffer buffer. \p
1060
+ * eventValueBufferSizeBytes indicates the size of the \p
1061
+ * eventValueBuffer buffer. The buffer must be at least sizeof(uint64)
1062
+ * if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set
1063
+ * on the group containing the event. The buffer must be at least
1064
+ * (sizeof(uint64) * number of domain instances) if
1065
+ * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is set on the
1066
+ * group.
1067
+ *
1068
+ * If any instance of an event counter overflows, the value returned
1069
+ * for that event instance will be ::CUPTI_EVENT_OVERFLOW.
1070
+ *
1071
+ * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE.
1072
+ *
1073
+ * Reading an event from a disabled event group is not allowed. After
1074
+ * being read, an event's value is reset to zero.
1075
+ * \note \b Thread-safety: this function is thread safe but client
1076
+ * must guard against simultaneous destruction or modification of \p
1077
+ * eventGroup (for example, client must guard against simultaneous
1078
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
1079
+ * etc.), and must guard against simultaneous destruction of the
1080
+ * context in which \p eventGroup was created (for example, client
1081
+ * must guard against simultaneous calls to cudaDeviceReset,
1082
+ * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is
1083
+ * called simultaneously with this function, then returned event
1084
+ * values are undefined.
1085
+ *
1086
+ * \param eventGroup The event group
1087
+ * \param flags Flags controlling the reading mode
1088
+ * \param event The event to read
1089
+ * \param eventValueBufferSizeBytes The size of \p eventValueBuffer
1090
+ * in bytes, and returns the number of bytes written to \p
1091
+ * eventValueBuffer
1092
+ * \param eventValueBuffer Returns the event value(s)
1093
+ *
1094
+ * \retval CUPTI_SUCCESS
1095
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
1096
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
1097
+ * \retval CUPTI_ERROR_HARDWARE
1098
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled
1099
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p
1100
+ * eventValueBufferSizeBytes or \p eventValueBuffer is NULL
1101
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer
1102
+ * is not sufficient
1103
+ */
1104
+ CUptiResult CUPTIAPI cuptiEventGroupReadEvent(CUpti_EventGroup eventGroup,
1105
+ CUpti_ReadEventFlags flags,
1106
+ CUpti_EventID event,
1107
+ size_t *eventValueBufferSizeBytes,
1108
+ uint64_t *eventValueBuffer);
1109
+
1110
+ /**
1111
+ * \brief Read the values for all the events in an event group.
1112
+ *
1113
+ * Read the values for all the events in an event group. The event
1114
+ * values are returned in the \p eventValueBuffer buffer. \p
1115
+ * eventValueBufferSizeBytes indicates the size of \p
1116
+ * eventValueBuffer. The buffer must be at least (sizeof(uint64) *
1117
+ * number of events in group) if
1118
+ * ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is not set on
1119
+ * the group containing the events. The buffer must be at least
1120
+ * (sizeof(uint64) * number of domain instances * number of events in
1121
+ * group) if ::CUPTI_EVENT_GROUP_ATTR_PROFILE_ALL_DOMAIN_INSTANCES is
1122
+ * set on the group.
1123
+ *
1124
+ * The data format returned in \p eventValueBuffer is:
1125
+ * - domain instance 0: event0 event1 ... eventN
1126
+ * - domain instance 1: event0 event1 ... eventN
1127
+ * - ...
1128
+ * - domain instance M: event0 event1 ... eventN
1129
+ *
1130
+ * The event order in \p eventValueBuffer is returned in \p
1131
+ * eventIdArray. The size of \p eventIdArray is specified in \p
1132
+ * eventIdArraySizeBytes. The size should be at least
1133
+ * (sizeof(CUpti_EventID) * number of events in group).
1134
+ *
1135
+ * If any instance of any event counter overflows, the value returned
1136
+ * for that event instance will be ::CUPTI_EVENT_OVERFLOW.
1137
+ *
1138
+ * The only allowed value for \p flags is ::CUPTI_EVENT_READ_FLAG_NONE.
1139
+ *
1140
+ * Reading events from a disabled event group is not allowed. After
1141
+ * being read, an event's value is reset to zero.
1142
+ * \note \b Thread-safety: this function is thread safe but client
1143
+ * must guard against simultaneous destruction or modification of \p
1144
+ * eventGroup (for example, client must guard against simultaneous
1145
+ * calls to \ref cuptiEventGroupDestroy, \ref cuptiEventGroupAddEvent,
1146
+ * etc.), and must guard against simultaneous destruction of the
1147
+ * context in which \p eventGroup was created (for example, client
1148
+ * must guard against simultaneous calls to cudaDeviceReset,
1149
+ * cuCtxDestroy, etc.). If \ref cuptiEventGroupResetAllEvents is
1150
+ * called simultaneously with this function, then returned event
1151
+ * values are undefined.
1152
+ *
1153
+ * \param eventGroup The event group
1154
+ * \param flags Flags controlling the reading mode
1155
+ * \param eventValueBufferSizeBytes The size of \p eventValueBuffer in
1156
+ * bytes, and returns the number of bytes written to \p
1157
+ * eventValueBuffer
1158
+ * \param eventValueBuffer Returns the event values
1159
+ * \param eventIdArraySizeBytes The size of \p eventIdArray in bytes,
1160
+ * and returns the number of bytes written to \p eventIdArray
1161
+ * \param eventIdArray Returns the IDs of the events in the same order
1162
+ * as the values returned in \p eventValueBuffer.
1163
+ * \param numEventIdsRead Returns the number of event IDs returned
1164
+ * in \p eventIdArray
1165
+ *
1166
+ * \retval CUPTI_SUCCESS
1167
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
1168
+ * \retval CUPTI_ERROR_HARDWARE
1169
+ * \retval CUPTI_ERROR_INVALID_OPERATION if \p eventGroup is disabled
1170
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroup, \p
1171
+ * eventValueBufferSizeBytes, \p eventValueBuffer, \p
1172
+ * eventIdArraySizeBytes, \p eventIdArray or \p numEventIdsRead is
1173
+ * NULL
1174
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT if size of \p eventValueBuffer
1175
+ * or \p eventIdArray is not sufficient
1176
+ */
1177
+ CUptiResult CUPTIAPI cuptiEventGroupReadAllEvents(CUpti_EventGroup eventGroup,
1178
+ CUpti_ReadEventFlags flags,
1179
+ size_t *eventValueBufferSizeBytes,
1180
+ uint64_t *eventValueBuffer,
1181
+ size_t *eventIdArraySizeBytes,
1182
+ CUpti_EventID *eventIdArray,
1183
+ size_t *numEventIdsRead);
1184
+
1185
+ /**
1186
+ * \brief For a set of events, get the grouping that indicates the
1187
+ * number of passes and the event groups necessary to collect the
1188
+ * events.
1189
+ *
1190
+ * The number of events that can be collected simultaneously varies by
1191
+ * device and by the type of the events. When events can be collected
1192
+ * simultaneously, they may need to be grouped into multiple event
1193
+ * groups because they are from different event domains. This function
1194
+ * takes a set of events and determines how many passes are required
1195
+ * to collect all those events, and which events can be collected
1196
+ * simultaneously in each pass.
1197
+ *
1198
+ * The CUpti_EventGroupSets returned in \p eventGroupPasses indicates
1199
+ * how many passes are required to collect the events with the \p
1200
+ * numSets field. Within each event group set, the \p sets array
1201
+ * indicates the event groups that should be collected on each pass.
1202
+ * \note \b Thread-safety: this function is thread safe, but client
1203
+ * must guard against another thread simultaneously destroying \p
1204
+ * context.
1205
+ *
1206
+ * \param context The context for event collection
1207
+ * \param eventIdArraySizeBytes Size of \p eventIdArray in bytes
1208
+ * \param eventIdArray Array of event IDs that need to be grouped
1209
+ * \param eventGroupPasses Returns a CUpti_EventGroupSets object that
1210
+ * indicates the number of passes required to collect the events and
1211
+ * the events to collect on each pass
1212
+ *
1213
+ * \retval CUPTI_SUCCESS
1214
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
1215
+ * \retval CUPTI_ERROR_INVALID_CONTEXT
1216
+ * \retval CUPTI_ERROR_INVALID_EVENT_ID
1217
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventIdArray or
1218
+ * \p eventGroupPasses is NULL
1219
+ */
1220
+ CUptiResult CUPTIAPI cuptiEventGroupSetsCreate(CUcontext context,
1221
+ size_t eventIdArraySizeBytes,
1222
+ CUpti_EventID *eventIdArray,
1223
+ CUpti_EventGroupSets **eventGroupPasses);
1224
+
1225
+ /**
1226
+ * \brief Destroy an event group sets object.
1227
+ *
1228
+ * Destroy a CUpti_EventGroupSets object.
1229
+ * \note \b Thread-safety: this function is thread safe.
1230
+ *
1231
+ * \param eventGroupSets The object to destroy
1232
+ *
1233
+ * \retval CUPTI_SUCCESS
1234
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
1235
+ * \retval CUPTI_ERROR_INVALID_OPERATION if any of the event groups
1236
+ * contained in the sets is enabled
1237
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSets is NULL
1238
+ */
1239
+ CUptiResult CUPTIAPI cuptiEventGroupSetsDestroy(CUpti_EventGroupSets *eventGroupSets);
1240
+
1241
+
1242
+ /**
1243
+ * \brief Enable an event group set.
1244
+ *
1245
+ * Enable a set of event groups. Enabling a set of event groups zeros the value of
1246
+ * all the events in all the groups and then starts collection of those events.
1247
+ * \note \b Thread-safety: this function is thread safe.
1248
+ *
1249
+ * \param eventGroupSet The pointer to the event group set
1250
+ *
1251
+ * \retval CUPTI_SUCCESS
1252
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
1253
+ * \retval CUPTI_ERROR_HARDWARE
1254
+ * \retval CUPTI_ERROR_NOT_READY if an event group in \p eventGroupSet does not contain any events
1255
+ * \retval CUPTI_ERROR_NOT_COMPATIBLE if \p eventGroupSet cannot be
1256
+ * enabled due to other already enabled event groups
1257
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL
1258
+ * \retval CUPTI_ERROR_HARDWARE_BUSY if another client is profiling and hardware is
1259
+ * busy
1260
+ */
1261
+ CUptiResult CUPTIAPI cuptiEventGroupSetEnable(CUpti_EventGroupSet *eventGroupSet);
1262
+
1263
+ /**
1264
+ * \brief Disable an event group set.
1265
+ *
1266
+ * Disable a set of event groups. Disabling a set of event groups
1267
+ * stops collection of events contained in the groups.
1268
+ * \note \b Thread-safety: this function is thread safe.
1269
+ * \note \b If this call fails, some of the event groups in the set may be disabled
1270
+ * and other event groups may remain enabled.
1271
+ *
1272
+ * \param eventGroupSet The pointer to the event group set
1273
+ * \retval CUPTI_SUCCESS
1274
+ * \retval CUPTI_ERROR_NOT_INITIALIZED
1275
+ * \retval CUPTI_ERROR_HARDWARE
1276
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p eventGroupSet is NULL
1277
+ */
1278
+ CUptiResult CUPTIAPI cuptiEventGroupSetDisable(CUpti_EventGroupSet *eventGroupSet);
1279
+
1280
+ /**
1281
+ * \brief Enable kernel replay mode.
1282
+ *
1283
+ * Set profiling mode for the context to replay mode. In this mode,
1284
+ * any number of events can be collected in one run of the kernel. The
1285
+ * event collection mode will automatically switch to
1286
+ * CUPTI_EVENT_COLLECTION_MODE_KERNEL. In this mode, \ref
1287
+ * cuptiSetEventCollectionMode will return
1288
+ * CUPTI_ERROR_INVALID_OPERATION.
1289
+ * \note \b Kernels might take longer to run if many events are enabled.
1290
+ * \note \b Thread-safety: this function is thread safe.
1291
+ *
1292
+ * \param context The context
1293
+ * \retval CUPTI_SUCCESS
1294
+ */
1295
+ CUptiResult CUPTIAPI cuptiEnableKernelReplayMode(CUcontext context);
1296
+
1297
+ /**
1298
+ * \brief Disable kernel replay mode.
1299
+ *
1300
+ * Set profiling mode for the context to non-replay (default)
1301
+ * mode. Event collection mode will be set to
1302
+ * CUPTI_EVENT_COLLECTION_MODE_KERNEL. All previously enabled
1303
+ * event groups and event group sets will be disabled.
1304
+ * \note \b Thread-safety: this function is thread safe.
1305
+ *
1306
+ * \param context The context
1307
+ * \retval CUPTI_SUCCESS
1308
+ */
1309
+ CUptiResult CUPTIAPI cuptiDisableKernelReplayMode(CUcontext context);
1310
+
1311
+ /**
1312
+ * \brief Function type for getting updates on kernel replay.
1313
+ *
1314
+ * \param kernelName The mangled kernel name
1315
+ * \param numReplaysDone Number of replays done so far
1316
+ * \param customData Pointer of any custom data passed in when subscribing
1317
+ */
1318
+ typedef void (CUPTIAPI *CUpti_KernelReplayUpdateFunc)(
1319
+ const char *kernelName,
1320
+ int numReplaysDone,
1321
+ void *customData);
1322
+
1323
+ /**
1324
+ * \brief Subscribe to kernel replay updates.
1325
+ *
1326
+ * When subscribed, the function pointer passed in will be called each time a
1327
+ * kernel run is finished during kernel replay. Previously subscribed function
1328
+ * pointer will be replaced. Passing NULL as the function pointer unsubscribes
1329
+ * the update.
1330
+ *
1331
+ * \param updateFunc The update function pointer
1332
+ * \param customData Pointer to any custom data
1333
+ * \retval CUPTI_SUCCESS
1334
+ */
1335
+ CUptiResult CUPTIAPI cuptiKernelReplaySubscribeUpdate(CUpti_KernelReplayUpdateFunc updateFunc, void *customData);
1336
+
1337
+ /** @} */ /* END CUPTI_EVENT_API */
1338
+
1339
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
1340
+ #pragma GCC visibility pop
1341
+ #endif
1342
+
1343
+ #if defined(__cplusplus)
1344
+ }
1345
+ #endif
1346
+
1347
+ #endif /*_CUPTI_EVENTS_H_*/
1348
+
1349
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_pcsampling.h ADDED
@@ -0,0 +1,936 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2020-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(_CUPTI_PCSAMPLING_H_)
51
+ #define _CUPTI_PCSAMPLING_H_
52
+
53
+ #include <cuda.h>
54
+ #include <stdint.h>
55
+ #include <stddef.h>
56
+ #include "cupti_result.h"
57
+ #include "cupti_common.h"
58
+
59
+
60
+ #if defined(__cplusplus)
61
+ extern "C" {
62
+ #endif
63
+
64
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
65
+ #pragma GCC visibility push(default)
66
+ #endif
67
+
68
+ /**
69
+ * \defgroup CUPTI_PCSAMPLING_API CUPTI PC Sampling API
70
+ * Functions, types, and enums that implement the CUPTI PC Sampling API.
71
+ * @{
72
+ */
73
+
74
+ #ifndef CUPTI_PCSAMPLING_STRUCT_SIZE
75
+ #define CUPTI_PCSAMPLING_STRUCT_SIZE(type_, lastfield_) (offsetof(type_, lastfield_) + sizeof(((type_*)0)->lastfield_))
76
+ #endif
77
+
78
+ #ifndef CUPTI_STALL_REASON_STRING_SIZE
79
+ #define CUPTI_STALL_REASON_STRING_SIZE 128
80
+ #endif
81
+
82
+ /**
83
+ * \brief PC Sampling collection mode
84
+ */
85
+ typedef enum
86
+ {
87
+ /**
88
+ * INVALID Value
89
+ */
90
+ CUPTI_PC_SAMPLING_COLLECTION_MODE_INVALID = 0,
91
+ /**
92
+ * Continuous mode. Kernels are not serialized in this mode.
93
+ */
94
+ CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS = 1,
95
+ /**
96
+ * Serialized mode. Kernels are serialized in this mode.
97
+ */
98
+ CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED = 2,
99
+ } CUpti_PCSamplingCollectionMode;
100
+
101
+ /**
102
+ * \brief PC Sampling stall reasons
103
+ */
104
+ typedef struct PACKED_ALIGNMENT
105
+ {
106
+ /**
107
+ * [r] Collected stall reason index
108
+ */
109
+ uint32_t pcSamplingStallReasonIndex;
110
+ /**
111
+ * [r] Number of times the PC was sampled with the stallReason.
112
+ */
113
+ uint32_t samples;
114
+ } CUpti_PCSamplingStallReason;
115
+
116
+ /**
117
+ * \brief PC Sampling data
118
+ */
119
+ typedef struct PACKED_ALIGNMENT
120
+ {
121
+ /**
122
+ * [w] Size of the data structure.
123
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
124
+ * available in the structure. Used to preserve backward compatibility.
125
+ */
126
+ size_t size;
127
+ /**
128
+ * [r] Unique cubin id
129
+ */
130
+ uint64_t cubinCrc;
131
+ /**
132
+ * [r] PC offset
133
+ */
134
+ uint64_t pcOffset;
135
+ /**
136
+ * The function's unique symbol index in the module.
137
+ */
138
+ uint32_t functionIndex;
139
+ /**
140
+ * Padding
141
+ */
142
+ uint32_t pad;
143
+ /**
144
+ * [r] The function name. This name string might be shared across all the records
145
+ * including records from activity APIs representing the same function, and so it should not be
146
+ * modified or freed until post processing of all the records is done. Once done, it is user’s responsibility to
147
+ * free the memory using free() function.
148
+ */
149
+ char* functionName;
150
+ /**
151
+ * [r] Collected stall reason count
152
+ */
153
+ size_t stallReasonCount;
154
+ /**
155
+ * [r] Stall reason id
156
+ * Total samples
157
+ */
158
+ CUpti_PCSamplingStallReason *stallReason;
159
+ /**
160
+ * The correlation ID of the kernel to which this result is associated. Only valid for serialized mode of pc sampling collection.
161
+ * For continuous mode of collection the correlationId will be set to 0.
162
+ */
163
+ uint32_t correlationId;
164
+ } CUpti_PCSamplingPCData;
165
+
166
+ /**
167
+ * \brief PC Sampling output data format
168
+ */
169
+ typedef enum
170
+ {
171
+ CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_INVALID = 0,
172
+ /**
173
+ * HW buffer data will be parsed during collection of data
174
+ */
175
+ CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED = 1,
176
+ } CUpti_PCSamplingOutputDataFormat;
177
+
178
+ /**
179
+ * \brief Collected PC Sampling data
180
+ *
181
+ */
182
+ typedef struct PACKED_ALIGNMENT
183
+ {
184
+ /**
185
+ * [w] Size of the data structure.
186
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
187
+ * available in the structure. Used to preserve backward compatibility.
188
+ */
189
+ size_t size;
190
+ /**
191
+ * [w] Number of PCs to be collected
192
+ */
193
+ size_t collectNumPcs;
194
+ /**
195
+ * [r] Number of samples collected across all PCs.
196
+ * It includes samples for user modules, samples for non-user kernels and dropped samples.
197
+ * It includes counts for all non selected stall reasons.
198
+ * CUPTI does not provide PC records for non-user kernels.
199
+ * CUPTI does not provide PC records for instructions for which all selected stall reason metrics counts are zero.
200
+ */
201
+ uint64_t totalSamples;
202
+ /**
203
+ * [r] Number of samples that were dropped by hardware due to backpressure/overflow.
204
+ */
205
+ uint64_t droppedSamples;
206
+ /**
207
+ * [r] Number of PCs collected
208
+ */
209
+ size_t totalNumPcs;
210
+ /**
211
+ * [r] Number of PCs available for collection
212
+ */
213
+ size_t remainingNumPcs;
214
+ /**
215
+ * [r] Unique identifier for each range.
216
+ * Data collected across multiple ranges in multiple buffers can be identified using range id.
217
+ */
218
+ uint64_t rangeId;
219
+ /**
220
+ * [r] Profiled PC data
221
+ * This data struct should have enough memory to collect number of PCs mentioned in \ref collectNumPcs
222
+ */
223
+ CUpti_PCSamplingPCData *pPcData;
224
+ /**
225
+ * [r] Number of samples collected across all non user kernels PCs.
226
+ * It includes samples for non-user kernels.
227
+ * It includes counts for all non selected stall reasons as well.
228
+ * CUPTI does not provide PC records for non-user kernels.
229
+ */
230
+ uint64_t nonUsrKernelsTotalSamples;
231
+
232
+ /**
233
+ * [r] Status of the hardware buffer.
234
+ * CUPTI returns the error code CUPTI_ERROR_OUT_OF_MEMORY when hardware buffer is full.
235
+ * When hardware buffer is full, user will get pc data as 0. To mitigate this issue, one or more of the below options can be tried:
236
+ * 1. Increase the hardware buffer size using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE
237
+ * 2. Decrease the thread sleep span using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN
238
+ * 3. Decrease the sampling frequency using the attribute CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD
239
+ */
240
+ uint8_t hardwareBufferFull;
241
+ } CUpti_PCSamplingData;
242
+
243
+ /**
244
+ * \brief PC Sampling configuration attributes
245
+ *
246
+ * PC Sampling configuration attribute types. These attributes can be read
247
+ * using \ref cuptiPCSamplingGetConfigurationAttribute and can be written
248
+ * using \ref cuptiPCSamplingSetConfigurationAttribute. Attributes marked
249
+ * [r] can only be read using \ref cuptiPCSamplingGetConfigurationAttribute
250
+ * [w] can only be written using \ref cuptiPCSamplingSetConfigurationAttribute
251
+ * [rw] can be read using \ref cuptiPCSamplingGetConfigurationAttribute and
252
+ * written using \ref cuptiPCSamplingSetConfigurationAttribute
253
+ */
254
+ typedef enum
255
+ {
256
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_INVALID = 0,
257
+ /**
258
+ * [rw] Sampling period for PC Sampling.
259
+ * DEFAULT - CUPTI defined value based on number of SMs
260
+ * Valid values for the sampling
261
+ * periods are between 5 to 31 both inclusive. This will set the
262
+ * sampling period to (2^samplingPeriod) cycles.
263
+ * For e.g. for sampling period = 5 to 31, cycles = 32, 64, 128,..., 2^31
264
+ * Value is a uint32_t
265
+ */
266
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD = 1,
267
+ /**
268
+ * [w] Number of stall reasons to collect.
269
+ * DEFAULT - All stall reasons will be collected
270
+ * Value is a size_t
271
+ * [w] Stall reasons to collect
272
+ * DEFAULT - All stall reasons will be collected
273
+ * Input value should be a pointer pointing to array of stall reason indexes
274
+ * containing all the stall reason indexes to collect.
275
+ */
276
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON = 2,
277
+ /**
278
+ * [rw] Size of SW buffer for raw PC counter data downloaded from HW buffer
279
+ * DEFAULT - 1 MB, which can accommodate approximately 5500 PCs
280
+ * with all stall reasons
281
+ * Approximately it takes 16 Bytes (and some fixed size memory)
282
+ * to accommodate one PC with one stall reason
283
+ * For e.g. 1 PC with 1 stall reason = 32 Bytes
284
+ * 1 PC with 2 stall reason = 48 Bytes
285
+ * 1 PC with 4 stall reason = 96 Bytes
286
+ * Value is a size_t
287
+ */
288
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE = 3,
289
+ /**
290
+ * [rw] Size of HW buffer in bytes
291
+ * DEFAULT - 512 MB
292
+ * If sampling period is too low, HW buffer can overflow
293
+ * and drop PC data
294
+ * Value is a size_t
295
+ */
296
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE = 4,
297
+ /**
298
+ * [rw] PC Sampling collection mode
299
+ * DEFAULT - CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS
300
+ * Input value should be of type \ref CUpti_PCSamplingCollectionMode.
301
+ */
302
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE = 5,
303
+ /**
304
+ * [rw] Control over PC Sampling data collection range
305
+ * Default - 0
306
+ * 1 - Allows user to start and stop PC Sampling using APIs -
307
+ * \ref cuptiPCSamplingStart() - Start PC Sampling
308
+ * \ref cuptiPCSamplingStop() - Stop PC Sampling
309
+ * Value is a uint32_t
310
+ */
311
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL = 6,
312
+ /**
313
+ * [w] Value for output data format
314
+ * Default - CUPTI_PC_SAMPLING_OUTPUT_DATA_FORMAT_PARSED
315
+ * Input value should be of type \ref CUpti_PCSamplingOutputDataFormat.
316
+ */
317
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT = 7,
318
+ /**
319
+ * [w] Data buffer to hold collected PC Sampling data PARSED_DATA
320
+ * Default - none.
321
+ * Buffer type is void * which can point to PARSED_DATA
322
+ * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA
323
+ */
324
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER = 8,
325
+ /**
326
+ * [rw] Control sleep time of the worker threads created by CUPTI for various PC sampling operations.
327
+ * CUPTI creates multiple worker threads to offload certain operations to these threads. This includes decoding of HW data to
328
+ * the CUPTI PC sampling data and correlating PC data to SASS instructions. CUPTI wakes up these threads periodically.
329
+ * Default - 100 milliseconds.
330
+ * Value is a uint32_t
331
+ */
332
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN = 9,
333
+ CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_FORCE_INT = 0x7fffffff,
334
+ } CUpti_PCSamplingConfigurationAttributeType;
335
+
336
+ /**
337
+ * \brief PC sampling configuration information structure
338
+ *
339
+ * This structure provides \ref CUpti_PCSamplingConfigurationAttributeType which can be configured
340
+ * or queried for PC sampling configuration
341
+ */
342
+ typedef struct
343
+ {
344
+ /**
345
+ * Refer \ref CUpti_PCSamplingConfigurationAttributeType for all supported attribute types
346
+ */
347
+ CUpti_PCSamplingConfigurationAttributeType attributeType;
348
+ /**
349
+ * Configure or query status for \p attributeType
350
+ * CUPTI_SUCCESS for valid \p attributeType and \p attributeData
351
+ * CUPTI_ERROR_INVALID_OPERATION if \p attributeData is not valid
352
+ * CUPTI_ERROR_INVALID_PARAMETER if \p attributeType is not valid
353
+ */
354
+ CUptiResult attributeStatus;
355
+ union
356
+ {
357
+ /**
358
+ * Invalid Value
359
+ */
360
+ struct
361
+ {
362
+ uint64_t data[3];
363
+ } invalidData;
364
+ /**
365
+ * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_PERIOD
366
+ */
367
+ struct
368
+ {
369
+ uint32_t samplingPeriod;
370
+ } samplingPeriodData;
371
+ /**
372
+ * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_STALL_REASON
373
+ */
374
+ struct
375
+ {
376
+ size_t stallReasonCount;
377
+ uint32_t *pStallReasonIndex;
378
+ } stallReasonData;
379
+ /**
380
+ * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SCRATCH_BUFFER_SIZE
381
+ */
382
+ struct
383
+ {
384
+ size_t scratchBufferSize;
385
+ } scratchBufferSizeData;
386
+ /**
387
+ * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_HARDWARE_BUFFER_SIZE
388
+ */
389
+ struct
390
+ {
391
+ size_t hardwareBufferSize;
392
+ } hardwareBufferSizeData;
393
+ /**
394
+ * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_COLLECTION_MODE
395
+ */
396
+ struct
397
+ {
398
+ CUpti_PCSamplingCollectionMode collectionMode;
399
+ } collectionModeData;
400
+ /**
401
+ * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL
402
+ */
403
+ struct
404
+ {
405
+ uint32_t enableStartStopControl;
406
+ } enableStartStopControlData;
407
+ /**
408
+ * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_OUTPUT_DATA_FORMAT
409
+ */
410
+ struct
411
+ {
412
+ CUpti_PCSamplingOutputDataFormat outputDataFormat;
413
+ } outputDataFormatData;
414
+ /**
415
+ * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_SAMPLING_DATA_BUFFER
416
+ */
417
+ struct
418
+ {
419
+ void *samplingDataBuffer;
420
+ } samplingDataBufferData;
421
+ /**
422
+ * Refer \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_WORKER_THREAD_PERIODIC_SLEEP_SPAN
423
+ */
424
+ struct
425
+ {
426
+ uint32_t workerThreadPeriodicSleepSpan;
427
+ } workerThreadPeriodicSleepSpanData;
428
+
429
+ } attributeData;
430
+ } CUpti_PCSamplingConfigurationInfo;
431
+
432
+ /**
433
+ * \brief PC sampling configuration structure
434
+ *
435
+ * This structure configures PC sampling using \ref cuptiPCSamplingSetConfigurationAttribute
436
+ * and queries PC sampling default configuration using \ref cuptiPCSamplingGetConfigurationAttribute
437
+ */
438
+ typedef struct
439
+ {
440
+ /**
441
+ * [w] Size of the data structure i.e. CUpti_PCSamplingConfigurationInfoParamsSize
442
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
443
+ * available in the structure. Used to preserve backward compatibility.
444
+ */
445
+ size_t size;
446
+ /**
447
+ * [w] Assign to NULL
448
+ */
449
+ void* pPriv;
450
+ /**
451
+ * [w] CUcontext
452
+ */
453
+ CUcontext ctx;
454
+ /**
455
+ * [w] Number of attributes to configure using \ref cuptiPCSamplingSetConfigurationAttribute or query
456
+ * using \ref cuptiPCSamplingGetConfigurationAttribute
457
+ */
458
+ size_t numAttributes;
459
+ /**
460
+ * Refer \ref CUpti_PCSamplingConfigurationInfo
461
+ */
462
+ CUpti_PCSamplingConfigurationInfo *pPCSamplingConfigurationInfo;
463
+ } CUpti_PCSamplingConfigurationInfoParams;
464
+ #define CUpti_PCSamplingConfigurationInfoParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingConfigurationInfoParams,pPCSamplingConfigurationInfo)
465
+
466
+ /**
467
+ * \brief Write PC Sampling configuration attribute.
468
+ *
469
+ * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams
470
+ * containing PC sampling configuration.
471
+ *
472
+ * \retval CUPTI_SUCCESS
473
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
474
+ * some invalid \p attrib.
475
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if attribute \p value is not valid
476
+ * or any \p pParams is not valid
477
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
478
+ * does not support the API
479
+ */
480
+ CUptiResult CUPTIAPI cuptiPCSamplingSetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams);
481
+
482
+ /**
483
+ * \brief Read PC Sampling configuration attribute.
484
+ *
485
+ * \param pParams A pointer to \ref CUpti_PCSamplingConfigurationInfoParams
486
+ * containing PC sampling configuration.
487
+ *
488
+ * \retval CUPTI_SUCCESS
489
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
490
+ * some invalid attribute.
491
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p attrib is not valid
492
+ * or any \p pParams is not valid
493
+ * \retval CUPTI_ERROR_PARAMETER_SIZE_NOT_SUFFICIENT indicates that
494
+ * the \p value buffer is too small to hold the attribute value
495
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
496
+ * does not support the API
497
+ */
498
+ CUptiResult CUPTIAPI cuptiPCSamplingGetConfigurationAttribute(CUpti_PCSamplingConfigurationInfoParams *pParams);
499
+
500
+ /**
501
+ * \brief Params for cuptiPCSamplingGetData
502
+ */
503
+ typedef struct
504
+ {
505
+ /**
506
+ * [w] Size of the data structure i.e. CUpti_PCSamplingGetDataParamsSize
507
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
508
+ * available in the structure. Used to preserve backward compatibility.
509
+ */
510
+ size_t size;
511
+ /**
512
+ * [w] Assign to NULL
513
+ */
514
+ void* pPriv;
515
+ /**
516
+ * [w] CUcontext
517
+ */
518
+ CUcontext ctx;
519
+ /**
520
+ * \param pcSamplingData Data buffer to hold collected PC Sampling data PARSED_DATA
521
+ * Buffer type is void * which can point to PARSED_DATA
522
+ * Refer \ref CUpti_PCSamplingData for buffer format for PARSED_DATA
523
+ */
524
+ void *pcSamplingData;
525
+ } CUpti_PCSamplingGetDataParams;
526
+ #define CUpti_PCSamplingGetDataParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetDataParams, pcSamplingData)
527
+ /**
528
+ * \brief Flush GPU PC sampling data periodically.
529
+ *
530
+ * Flushing of GPU PC Sampling data is required at the following points to maintain uniqueness of PCs:
531
+ * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, after every module load-unload-load
532
+ * For \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_KERNEL_SERIALIZED, after every kernel ends
533
+ * If configuration option \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL
534
+ * is enabled, then after every range end i.e. \ref cuptiPCSamplingStop()
535
+ *
536
+ * If application is profiled in \ref CUPTI_PC_SAMPLING_COLLECTION_MODE_CONTINUOUS, with disabled
537
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL, and there is no module unload,
538
+ * user can collect data in two ways:
539
+ * Use \ref cuptiPCSamplingGetData() API periodically
540
+ * Use \ref cuptiPCSamplingDisable() on application exit and read GPU PC sampling data from sampling
541
+ * data buffer passed during configuration.
542
+ * Note: In case, \ref cuptiPCSamplingGetData() API is not called periodically, then sampling data buffer
543
+ * passed during configuration should be large enough to hold all PCs data.
544
+ * \ref cuptiPCSamplingGetData() API never does device synchronization.
545
+ * It is possible that when the API is called there is some unconsumed data from the HW buffer. In this case
546
+ * CUPTI provides only the data available with it at that moment.
547
+ *
548
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetDataParams
549
+ *
550
+ * \retval CUPTI_SUCCESS
551
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called without
552
+ * enabling PC sampling.
553
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
554
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
555
+ * does not support the API
556
+ * \retval CUPTI_ERROR_OUT_OF_MEMORY indicates that the HW buffer is full
557
+ */
558
+ CUptiResult CUPTIAPI cuptiPCSamplingGetData(CUpti_PCSamplingGetDataParams *pParams);
559
+
560
+ /**
561
+ * \brief Params for cuptiPCSamplingEnable
562
+ */
563
+ typedef struct
564
+ {
565
+ /**
566
+ * [w] Size of the data structure i.e. CUpti_PCSamplingEnableParamsSize
567
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
568
+ * available in the structure. Used to preserve backward compatibility.
569
+ */
570
+ size_t size;
571
+ /**
572
+ * [w] Assign to NULL
573
+ */
574
+ void* pPriv;
575
+ /**
576
+ * [w] CUcontext
577
+ */
578
+ CUcontext ctx;
579
+ } CUpti_PCSamplingEnableParams;
580
+ #define CUpti_PCSamplingEnableParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingEnableParams, ctx)
581
+
582
+ /**
583
+ * \brief Enable PC sampling.
584
+ *
585
+ * \param pParams A pointer to \ref CUpti_PCSamplingEnableParams
586
+ *
587
+ * \retval CUPTI_SUCCESS
588
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
589
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
590
+ * does not support the API
591
+ */
592
+ CUptiResult CUPTIAPI cuptiPCSamplingEnable(CUpti_PCSamplingEnableParams *pParams);
593
+
594
+ /**
595
+ * \brief Params for cuptiPCSamplingDisable
596
+ */
597
+ typedef struct
598
+ {
599
+ /**
600
+ * [w] Size of the data structure i.e. CUpti_PCSamplingDisableParamsSize
601
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
602
+ * available in the structure. Used to preserve backward compatibility.
603
+ */
604
+ size_t size;
605
+ /**
606
+ * [w] Assign to NULL
607
+ */
608
+ void* pPriv;
609
+ /**
610
+ * [w] CUcontext
611
+ */
612
+ CUcontext ctx;
613
+ } CUpti_PCSamplingDisableParams;
614
+ #define CUpti_PCSamplingDisableParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingDisableParams, ctx)
615
+
616
+ /**
617
+ * \brief Disable PC sampling.
618
+ *
619
+ * For application which doesn't destroy the CUDA context explicitly,
620
+ * this API does the PC Sampling tear-down, joins threads and copies PC records in the buffer provided
621
+ * during the PC sampling configuration. PC records which can't be accommodated in the buffer are discarded.
622
+ *
623
+ * \param pParams A pointer to \ref CUpti_PCSamplingDisableParams
624
+ *
625
+ * \retval CUPTI_SUCCESS
626
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
627
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
628
+ * does not support the API
629
+ */
630
+ CUptiResult CUPTIAPI cuptiPCSamplingDisable(CUpti_PCSamplingDisableParams *pParams);
631
+
632
+ /**
633
+ * \brief Params for cuptiPCSamplingStart
634
+ */
635
+ typedef struct
636
+ {
637
+ /**
638
+ * [w] Size of the data structure i.e. CUpti_PCSamplingStartParamsSize
639
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
640
+ * available in the structure. Used to preserve backward compatibility.
641
+ */
642
+ size_t size;
643
+ /**
644
+ * [w] Assign to NULL
645
+ */
646
+ void* pPriv;
647
+ /**
648
+ * [w] CUcontext
649
+ */
650
+ CUcontext ctx;
651
+ } CUpti_PCSamplingStartParams;
652
+ #define CUpti_PCSamplingStartParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStartParams, ctx)
653
+
654
+ /**
655
+ * \brief Start PC sampling.
656
+ *
657
+ * User can collect PC Sampling data for user-defined range specified by Start/Stop APIs.
658
+ * This API can be used to mark starting of range. Set configuration option
659
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
660
+ *
661
+ * \param pParams A pointer to \ref CUpti_PCSamplingStartParams
662
+ *
663
+ * \retval CUPTI_SUCCESS
664
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
665
+ * incorrect PC Sampling configuration.
666
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
667
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
668
+ * does not support the API
669
+ */
670
+ CUptiResult CUPTIAPI cuptiPCSamplingStart(CUpti_PCSamplingStartParams *pParams);
671
+
672
+ /**
673
+ * \brief Params for cuptiPCSamplingStop
674
+ */
675
+ typedef struct
676
+ {
677
+ /**
678
+ * [w] Size of the data structure i.e. CUpti_PCSamplingStopParamsSize
679
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
680
+ * available in the structure. Used to preserve backward compatibility.
681
+ */
682
+ size_t size;
683
+ /**
684
+ * [w] Assign to NULL
685
+ */
686
+ void* pPriv;
687
+ /**
688
+ * [w] CUcontext
689
+ */
690
+ CUcontext ctx;
691
+ } CUpti_PCSamplingStopParams;
692
+ #define CUpti_PCSamplingStopParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingStopParams, ctx)
693
+
694
+ /**
695
+ * \brief Stop PC sampling.
696
+ *
697
+ * User can collect PC Sampling data for user-defined range specified by Start/Stop APIs.
698
+ * This API can be used to mark end of range. Set configuration option
699
+ * \ref CUPTI_PC_SAMPLING_CONFIGURATION_ATTR_TYPE_ENABLE_START_STOP_CONTROL to use this API.
700
+ *
701
+ * \param pParams A pointer to \ref CUpti_PCSamplingStopParams
702
+ *
703
+ * \retval CUPTI_SUCCESS
704
+ * \retval CUPTI_ERROR_INVALID_OPERATION if this API is called with
705
+ * incorrect PC Sampling configuration.
706
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
707
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
708
+ * does not support the API
709
+ */
710
+ CUptiResult CUPTIAPI cuptiPCSamplingStop(CUpti_PCSamplingStopParams *pParams);
711
+
712
+ /**
713
+ * \brief Params for cuptiPCSamplingGetNumStallReasons
714
+ */
715
+ typedef struct
716
+ {
717
+ /**
718
+ * [w] Size of the data structure i.e. CUpti_PCSamplingGetNumStallReasonsParamsSize
719
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
720
+ * available in the structure. Used to preserve backward compatibility.
721
+ */
722
+ size_t size;
723
+ /**
724
+ * [w] Assign to NULL
725
+ */
726
+ void* pPriv;
727
+ /**
728
+ * [w] CUcontext
729
+ */
730
+ CUcontext ctx;
731
+ /**
732
+ * [r] Number of stall reasons
733
+ */
734
+ size_t *numStallReasons;
735
+ } CUpti_PCSamplingGetNumStallReasonsParams;
736
+ #define CUpti_PCSamplingGetNumStallReasonsParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetNumStallReasonsParams, numStallReasons)
737
+
738
+ /**
739
+ * \brief Get PC sampling stall reason count.
740
+ *
741
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetNumStallReasonsParams
742
+ *
743
+ * \retval CUPTI_SUCCESS
744
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
745
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
746
+ * does not support the API
747
+ */
748
+ CUptiResult CUPTIAPI cuptiPCSamplingGetNumStallReasons(CUpti_PCSamplingGetNumStallReasonsParams *pParams);
749
+
750
+ /**
751
+ * \brief Params for cuptiPCSamplingGetStallReasons
752
+ */
753
+ typedef struct
754
+ {
755
+ /**
756
+ * [w] Size of the data structure i.e. CUpti_PCSamplingGetStallReasonsParamsSize
757
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
758
+ * available in the structure. Used to preserve backward compatibility.
759
+ */
760
+ size_t size;
761
+ /**
762
+ * [w] Assign to NULL
763
+ */
764
+ void* pPriv;
765
+ /**
766
+ * [w] CUcontext
767
+ */
768
+ CUcontext ctx;
769
+ /**
770
+ * [w] Number of stall reasons
771
+ */
772
+ size_t numStallReasons;
773
+ /**
774
+ * [r] Stall reason index
775
+ */
776
+ uint32_t *stallReasonIndex;
777
+ /**
778
+ * [r] Stall reasons name
779
+ */
780
+ char **stallReasons;
781
+ } CUpti_PCSamplingGetStallReasonsParams;
782
+ #define CUpti_PCSamplingGetStallReasonsParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_PCSamplingGetStallReasonsParams, stallReasons)
783
+
784
+ /**
785
+ * \brief Get PC sampling stall reasons.
786
+ *
787
+ * \param pParams A pointer to \ref CUpti_PCSamplingGetStallReasonsParams
788
+ *
789
+ * \retval CUPTI_SUCCESS
790
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if any \p pParams is not valid
791
+ * \retval CUPTI_ERROR_NOT_SUPPORTED indicates that the system/device
792
+ * does not support the API
793
+ */
794
+ CUptiResult CUPTIAPI cuptiPCSamplingGetStallReasons(CUpti_PCSamplingGetStallReasonsParams *pParams);
795
+
796
+
797
+ /**
798
+ * \brief Params for cuptiGetSassToSourceCorrelation
799
+ */
800
+ typedef struct CUpti_GetSassToSourceCorrelationParams {
801
+ /**
802
+ * [w] Size of the data structure i.e. CUpti_GetSassToSourceCorrelationParamsSize
803
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
804
+ * available in the structure. Used to preserve backward compatibility.
805
+ */
806
+ size_t size;
807
+ /**
808
+ * [w] Pointer to cubin binary where function belongs.
809
+ */
810
+ const void* cubin;
811
+ /**
812
+ * [w] Function name to which PC belongs.
813
+ */
814
+ const char *functionName;
815
+ /**
816
+ * [w] Size of cubin binary.
817
+ */
818
+ size_t cubinSize;
819
+ /**
820
+ * [r] Line number in the source code.
821
+ */
822
+ uint32_t lineNumber;
823
+ /**
824
+ * [w] PC offset
825
+ */
826
+ uint64_t pcOffset;
827
+ /**
828
+ * [r] Path for the source file.
829
+ */
830
+ char *fileName;
831
+ /**
832
+ * [r] Path for the directory of source file.
833
+ */
834
+ char *dirName;
835
+ } CUpti_GetSassToSourceCorrelationParams;
836
+
837
+ #define CUpti_GetSassToSourceCorrelationParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetSassToSourceCorrelationParams, dirName)
838
+
839
+ /**
840
+ * \brief SASS to Source correlation.
841
+ *
842
+ * \param pParams A pointer to \ref CUpti_GetSassToSourceCorrelationParams
843
+ *
844
+ * It is expected from user to free allocated memory for fileName and dirName after use.
845
+ *
846
+ * \retval CUPTI_SUCCESS
847
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if either of the parameters cubin or functionName
848
+ * is NULL or cubinSize is zero or size field is not set correctly.
849
+ * \retval CUPTI_ERROR_INVALID_MODULE provided cubin is invalid.
850
+ * \retval CUPTI_ERROR_UNKNOWN an internal error occurred.
851
+ * This error code is also used for cases when the function is not present in the module.
852
+ * A better error code will be returned in the future release.
853
+ */
854
+ CUptiResult CUPTIAPI cuptiGetSassToSourceCorrelation(CUpti_GetSassToSourceCorrelationParams *pParams);
855
+
856
+ /**
857
+ * \brief Params for cuptiGetCubinCrc
858
+ */
859
+ typedef struct {
860
+ /**
861
+ * [w] Size of configuration structure.
862
+ * CUPTI client should set the size of the structure. It will be used in CUPTI to check what fields are
863
+ * available in the structure. Used to preserve backward compatibility.
864
+ */
865
+ size_t size;
866
+ /**
867
+ * [w] Size of cubin binary.
868
+ */
869
+ size_t cubinSize;
870
+ /**
871
+ * [w] Pointer to cubin binary
872
+ */
873
+ const void* cubin;
874
+ /**
875
+ * [r] Computed CRC will be stored in it.
876
+ */
877
+ uint64_t cubinCrc;
878
+ } CUpti_GetCubinCrcParams;
879
+ #define CUpti_GetCubinCrcParamsSize CUPTI_PCSAMPLING_STRUCT_SIZE(CUpti_GetCubinCrcParams, cubinCrc)
880
+
881
+ /**
882
+ * \brief Get the CRC of cubin.
883
+ *
884
+ * This function returns the CRC of provided cubin binary.
885
+ *
886
+ * \param pParams A pointer to \ref CUpti_GetCubinCrcParams
887
+ *
888
+ * \retval CUPTI_SUCCESS
889
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if parameter cubin is NULL or
890
+ * provided cubinSize is zero or size field is not set.
891
+ */
892
+ CUptiResult CUPTIAPI cuptiGetCubinCrc(CUpti_GetCubinCrcParams *pParams);
893
+
894
+ /**
895
+ * \brief Function type for callback used by CUPTI to request crc of
896
+ * loaded module.
897
+ *
898
+ * This callback function asks for crc of provided module in function.
899
+ * The provided crc will be stored in PC sampling records i.e. in the field 'cubinCrc' of the PC sampling
900
+ * struct CUpti_PCSamplingPCData. The CRC is used during the offline source correlation to uniquely identify the module.
901
+ *
902
+ * \param cubin The pointer to cubin binary
903
+ * \param cubinSize The size of cubin binary.
904
+ * \param cubinCrc Returns the computed crc of cubin.
905
+ */
906
+ typedef void (CUPTIAPI *CUpti_ComputeCrcCallbackFunc)(
907
+ const void* cubin,
908
+ size_t cubinSize,
909
+ uint64_t *cubinCrc);
910
+
911
+ /**
912
+ * \brief Register callback function with CUPTI to use
913
+ * your own algorithm to compute cubin crc.
914
+ *
915
+ * This function registers a callback function and it gets called
916
+ * from CUPTI when a CUDA module is loaded.
917
+ *
918
+ * \param funcComputeCubinCrc callback is invoked when a CUDA module
919
+ * is loaded.
920
+ *
921
+ * \retval CUPTI_SUCCESS
922
+ * \retval CUPTI_ERROR_INVALID_PARAMETER if \p funcComputeCubinCrc is NULL.
923
+ */
924
+ CUptiResult CUPTIAPI cuptiRegisterComputeCrcCallback(CUpti_ComputeCrcCallbackFunc funcComputeCubinCrc);
925
+
926
+ /** @} */ /* END CUPTI_PCSAMPLING_API */
927
+
928
+ #if defined(__GNUC__) && defined(CUPTI_LIB)
929
+ #pragma GCC visibility pop
930
+ #endif
931
+
932
+ #if defined(__cplusplus)
933
+ }
934
+ #endif
935
+
936
+ #endif /*_CUPTI_PCSAMPLING_H_*/
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/cupti_runtime_cbid.h ADDED
@@ -0,0 +1,504 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ // *************************************************************************
3
+ // Definitions of indices for API functions, unique across entire API
4
+ // *************************************************************************
5
+
6
+ // This file is generated. Any changes you make will be lost during the next clean build.
7
+ // CUDA public interface, for type definitions and cu* function prototypes
8
+
9
+ #if !defined(_CUPTI_RUNTIME_CBID_H)
10
+ #define _CUPTI_RUNTIME_CBID_H
11
+
12
+ typedef enum CUpti_runtime_api_trace_cbid_enum {
13
+ CUPTI_RUNTIME_TRACE_CBID_INVALID = 0,
14
+ CUPTI_RUNTIME_TRACE_CBID_cudaDriverGetVersion_v3020 = 1,
15
+ CUPTI_RUNTIME_TRACE_CBID_cudaRuntimeGetVersion_v3020 = 2,
16
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceCount_v3020 = 3,
17
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v3020 = 4,
18
+ CUPTI_RUNTIME_TRACE_CBID_cudaChooseDevice_v3020 = 5,
19
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetChannelDesc_v3020 = 6,
20
+ CUPTI_RUNTIME_TRACE_CBID_cudaCreateChannelDesc_v3020 = 7,
21
+ CUPTI_RUNTIME_TRACE_CBID_cudaConfigureCall_v3020 = 8,
22
+ CUPTI_RUNTIME_TRACE_CBID_cudaSetupArgument_v3020 = 9,
23
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 = 10,
24
+ CUPTI_RUNTIME_TRACE_CBID_cudaPeekAtLastError_v3020 = 11,
25
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorString_v3020 = 12,
26
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020 = 13,
27
+ CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetCacheConfig_v3020 = 14,
28
+ CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetAttributes_v3020 = 15,
29
+ CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 = 16,
30
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 = 17,
31
+ CUPTI_RUNTIME_TRACE_CBID_cudaSetValidDevices_v3020 = 18,
32
+ CUPTI_RUNTIME_TRACE_CBID_cudaSetDeviceFlags_v3020 = 19,
33
+ CUPTI_RUNTIME_TRACE_CBID_cudaMalloc_v3020 = 20,
34
+ CUPTI_RUNTIME_TRACE_CBID_cudaMallocPitch_v3020 = 21,
35
+ CUPTI_RUNTIME_TRACE_CBID_cudaFree_v3020 = 22,
36
+ CUPTI_RUNTIME_TRACE_CBID_cudaMallocArray_v3020 = 23,
37
+ CUPTI_RUNTIME_TRACE_CBID_cudaFreeArray_v3020 = 24,
38
+ CUPTI_RUNTIME_TRACE_CBID_cudaMallocHost_v3020 = 25,
39
+ CUPTI_RUNTIME_TRACE_CBID_cudaFreeHost_v3020 = 26,
40
+ CUPTI_RUNTIME_TRACE_CBID_cudaHostAlloc_v3020 = 27,
41
+ CUPTI_RUNTIME_TRACE_CBID_cudaHostGetDevicePointer_v3020 = 28,
42
+ CUPTI_RUNTIME_TRACE_CBID_cudaHostGetFlags_v3020 = 29,
43
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemGetInfo_v3020 = 30,
44
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020 = 31,
45
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_v3020 = 32,
46
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_v3020 = 33,
47
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_v3020 = 34,
48
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_v3020 = 35,
49
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_v3020 = 36,
50
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_v3020 = 37,
51
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_v3020 = 38,
52
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_v3020 = 39,
53
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_v3020 = 40,
54
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020 = 41,
55
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_v3020 = 42,
56
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_v3020 = 43,
57
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_v3020 = 44,
58
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_v3020 = 45,
59
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_v3020 = 46,
60
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_v3020 = 47,
61
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_v3020 = 48,
62
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemset_v3020 = 49,
63
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_v3020 = 50,
64
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_v3020 = 51,
65
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_v3020 = 52,
66
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolAddress_v3020 = 53,
67
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetSymbolSize_v3020 = 54,
68
+ CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture_v3020 = 55,
69
+ CUPTI_RUNTIME_TRACE_CBID_cudaBindTexture2D_v3020 = 56,
70
+ CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToArray_v3020 = 57,
71
+ CUPTI_RUNTIME_TRACE_CBID_cudaUnbindTexture_v3020 = 58,
72
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureAlignmentOffset_v3020 = 59,
73
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureReference_v3020 = 60,
74
+ CUPTI_RUNTIME_TRACE_CBID_cudaBindSurfaceToArray_v3020 = 61,
75
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceReference_v3020 = 62,
76
+ CUPTI_RUNTIME_TRACE_CBID_cudaGLSetGLDevice_v3020 = 63,
77
+ CUPTI_RUNTIME_TRACE_CBID_cudaGLRegisterBufferObject_v3020 = 64,
78
+ CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObject_v3020 = 65,
79
+ CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObject_v3020 = 66,
80
+ CUPTI_RUNTIME_TRACE_CBID_cudaGLUnregisterBufferObject_v3020 = 67,
81
+ CUPTI_RUNTIME_TRACE_CBID_cudaGLSetBufferObjectMapFlags_v3020 = 68,
82
+ CUPTI_RUNTIME_TRACE_CBID_cudaGLMapBufferObjectAsync_v3020 = 69,
83
+ CUPTI_RUNTIME_TRACE_CBID_cudaGLUnmapBufferObjectAsync_v3020 = 70,
84
+ CUPTI_RUNTIME_TRACE_CBID_cudaWGLGetDevice_v3020 = 71,
85
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterImage_v3020 = 72,
86
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsGLRegisterBuffer_v3020 = 73,
87
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnregisterResource_v3020 = 74,
88
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceSetMapFlags_v3020 = 75,
89
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsMapResources_v3020 = 76,
90
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsUnmapResources_v3020 = 77,
91
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedPointer_v3020 = 78,
92
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsSubResourceGetMappedArray_v3020 = 79,
93
+ CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUGetDevice_v3020 = 80,
94
+ CUPTI_RUNTIME_TRACE_CBID_cudaVDPAUSetVDPAUDevice_v3020 = 81,
95
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterVideoSurface_v3020 = 82,
96
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsVDPAURegisterOutputSurface_v3020 = 83,
97
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevice_v3020 = 84,
98
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDevices_v3020 = 85,
99
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D11SetDirect3DDevice_v3020 = 86,
100
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D11RegisterResource_v3020 = 87,
101
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevice_v3020 = 88,
102
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDevices_v3020 = 89,
103
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10SetDirect3DDevice_v3020 = 90,
104
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D10RegisterResource_v3020 = 91,
105
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10RegisterResource_v3020 = 92,
106
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnregisterResource_v3020 = 93,
107
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10MapResources_v3020 = 94,
108
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10UnmapResources_v3020 = 95,
109
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceSetMapFlags_v3020 = 96,
110
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetSurfaceDimensions_v3020 = 97,
111
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedArray_v3020 = 98,
112
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPointer_v3020 = 99,
113
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedSize_v3020 = 100,
114
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10ResourceGetMappedPitch_v3020 = 101,
115
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevice_v3020 = 102,
116
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDevices_v3020 = 103,
117
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9SetDirect3DDevice_v3020 = 104,
118
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9GetDirect3DDevice_v3020 = 105,
119
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsD3D9RegisterResource_v3020 = 106,
120
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterResource_v3020 = 107,
121
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterResource_v3020 = 108,
122
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapResources_v3020 = 109,
123
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapResources_v3020 = 110,
124
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceSetMapFlags_v3020 = 111,
125
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetSurfaceDimensions_v3020 = 112,
126
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedArray_v3020 = 113,
127
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPointer_v3020 = 114,
128
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedSize_v3020 = 115,
129
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9ResourceGetMappedPitch_v3020 = 116,
130
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9Begin_v3020 = 117,
131
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9End_v3020 = 118,
132
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9RegisterVertexBuffer_v3020 = 119,
133
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnregisterVertexBuffer_v3020 = 120,
134
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9MapVertexBuffer_v3020 = 121,
135
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D9UnmapVertexBuffer_v3020 = 122,
136
+ CUPTI_RUNTIME_TRACE_CBID_cudaThreadExit_v3020 = 123,
137
+ CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForDevice_v3020 = 124,
138
+ CUPTI_RUNTIME_TRACE_CBID_cudaSetDoubleForHost_v3020 = 125,
139
+ CUPTI_RUNTIME_TRACE_CBID_cudaThreadSynchronize_v3020 = 126,
140
+ CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetLimit_v3020 = 127,
141
+ CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetLimit_v3020 = 128,
142
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreate_v3020 = 129,
143
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v3020 = 130,
144
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_v3020 = 131,
145
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_v3020 = 132,
146
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 = 133,
147
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 = 134,
148
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020 = 135,
149
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020 = 136,
150
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020 = 137,
151
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventQuery_v3020 = 138,
152
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventElapsedTime_v3020 = 139,
153
+ CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3D_v3020 = 140,
154
+ CUPTI_RUNTIME_TRACE_CBID_cudaMalloc3DArray_v3020 = 141,
155
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_v3020 = 142,
156
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_v3020 = 143,
157
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_v3020 = 144,
158
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_v3020 = 145,
159
+ CUPTI_RUNTIME_TRACE_CBID_cudaThreadSetCacheConfig_v3020 = 146,
160
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_v3020 = 147,
161
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D11GetDirect3DDevice_v3020 = 148,
162
+ CUPTI_RUNTIME_TRACE_CBID_cudaD3D10GetDirect3DDevice_v3020 = 149,
163
+ CUPTI_RUNTIME_TRACE_CBID_cudaThreadGetCacheConfig_v3020 = 150,
164
+ CUPTI_RUNTIME_TRACE_CBID_cudaPointerGetAttributes_v4000 = 151,
165
+ CUPTI_RUNTIME_TRACE_CBID_cudaHostRegister_v4000 = 152,
166
+ CUPTI_RUNTIME_TRACE_CBID_cudaHostUnregister_v4000 = 153,
167
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceCanAccessPeer_v4000 = 154,
168
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceEnablePeerAccess_v4000 = 155,
169
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceDisablePeerAccess_v4000 = 156,
170
+ CUPTI_RUNTIME_TRACE_CBID_cudaPeerRegister_v4000 = 157,
171
+ CUPTI_RUNTIME_TRACE_CBID_cudaPeerUnregister_v4000 = 158,
172
+ CUPTI_RUNTIME_TRACE_CBID_cudaPeerGetDevicePointer_v4000 = 159,
173
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeer_v4000 = 160,
174
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyPeerAsync_v4000 = 161,
175
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_v4000 = 162,
176
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_v4000 = 163,
177
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceReset_v3020 = 164,
178
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSynchronize_v3020 = 165,
179
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetLimit_v3020 = 166,
180
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetLimit_v3020 = 167,
181
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetCacheConfig_v3020 = 168,
182
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetCacheConfig_v3020 = 169,
183
+ CUPTI_RUNTIME_TRACE_CBID_cudaProfilerInitialize_v4000 = 170,
184
+ CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStart_v4000 = 171,
185
+ CUPTI_RUNTIME_TRACE_CBID_cudaProfilerStop_v4000 = 172,
186
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetByPCIBusId_v4010 = 173,
187
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetPCIBusId_v4010 = 174,
188
+ CUPTI_RUNTIME_TRACE_CBID_cudaGLGetDevices_v4010 = 175,
189
+ CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetEventHandle_v4010 = 176,
190
+ CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenEventHandle_v4010 = 177,
191
+ CUPTI_RUNTIME_TRACE_CBID_cudaIpcGetMemHandle_v4010 = 178,
192
+ CUPTI_RUNTIME_TRACE_CBID_cudaIpcOpenMemHandle_v4010 = 179,
193
+ CUPTI_RUNTIME_TRACE_CBID_cudaIpcCloseMemHandle_v4010 = 180,
194
+ CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetInfo_v4010 = 181,
195
+ CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetSharedMemConfig_v4020 = 182,
196
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetSharedMemConfig_v4020 = 183,
197
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetSharedMemConfig_v4020 = 184,
198
+ CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v5000 = 185,
199
+ CUPTI_RUNTIME_TRACE_CBID_cudaDestroyTextureObject_v5000 = 186,
200
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceDesc_v5000 = 187,
201
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v5000 = 188,
202
+ CUPTI_RUNTIME_TRACE_CBID_cudaCreateSurfaceObject_v5000 = 189,
203
+ CUPTI_RUNTIME_TRACE_CBID_cudaDestroySurfaceObject_v5000 = 190,
204
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetSurfaceObjectResourceDesc_v5000 = 191,
205
+ CUPTI_RUNTIME_TRACE_CBID_cudaMallocMipmappedArray_v5000 = 192,
206
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetMipmappedArrayLevel_v5000 = 193,
207
+ CUPTI_RUNTIME_TRACE_CBID_cudaFreeMipmappedArray_v5000 = 194,
208
+ CUPTI_RUNTIME_TRACE_CBID_cudaBindTextureToMipmappedArray_v5000 = 195,
209
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedMipmappedArray_v5000 = 196,
210
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_v5000 = 197,
211
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithFlags_v5000 = 198,
212
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectResourceViewDesc_v5000 = 199,
213
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetAttribute_v5000 = 200,
214
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamDestroy_v5050 = 201,
215
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamCreateWithPriority_v5050 = 202,
216
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_v5050 = 203,
217
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_v5050 = 204,
218
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetStreamPriorityRange_v5050 = 205,
219
+ CUPTI_RUNTIME_TRACE_CBID_cudaMallocManaged_v6000 = 206,
220
+ CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6000 = 207,
221
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_v6000 = 208,
222
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetErrorName_v6050 = 209,
223
+ CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessor_v6050 = 210,
224
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000 = 211,
225
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceFlags_v7000 = 212,
226
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_ptsz_v7000 = 213,
227
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_ptsz_v7000 = 214,
228
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_ptds_v7000 = 215,
229
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2D_ptds_v7000 = 216,
230
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArray_ptds_v7000 = 217,
231
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArray_ptds_v7000 = 218,
232
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArray_ptds_v7000 = 219,
233
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArray_ptds_v7000 = 220,
234
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyArrayToArray_ptds_v7000 = 221,
235
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DArrayToArray_ptds_v7000 = 222,
236
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbol_ptds_v7000 = 223,
237
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbol_ptds_v7000 = 224,
238
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_ptsz_v7000 = 225,
239
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToArrayAsync_ptsz_v7000 = 226,
240
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromArrayAsync_ptsz_v7000 = 227,
241
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DAsync_ptsz_v7000 = 228,
242
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DToArrayAsync_ptsz_v7000 = 229,
243
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy2DFromArrayAsync_ptsz_v7000 = 230,
244
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyToSymbolAsync_ptsz_v7000 = 231,
245
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyFromSymbolAsync_ptsz_v7000 = 232,
246
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemset_ptds_v7000 = 233,
247
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemset2D_ptds_v7000 = 234,
248
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemsetAsync_ptsz_v7000 = 235,
249
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemset2DAsync_ptsz_v7000 = 236,
250
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetPriority_ptsz_v7000 = 237,
251
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetFlags_ptsz_v7000 = 238,
252
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamSynchronize_ptsz_v7000 = 239,
253
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamQuery_ptsz_v7000 = 240,
254
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamAttachMemAsync_ptsz_v7000 = 241,
255
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_ptsz_v7000 = 242,
256
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemset3D_ptds_v7000 = 243,
257
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemset3DAsync_ptsz_v7000 = 244,
258
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3D_ptds_v7000 = 245,
259
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DAsync_ptsz_v7000 = 246,
260
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamWaitEvent_ptsz_v7000 = 247,
261
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamAddCallback_ptsz_v7000 = 248,
262
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeer_ptds_v7000 = 249,
263
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DPeerAsync_ptsz_v7000 = 250,
264
+ CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000 = 251,
265
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v8000 = 252,
266
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_ptsz_v8000 = 253,
267
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v8000 = 254,
268
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetP2PAttribute_v8000 = 255,
269
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsEGLRegisterImage_v7000 = 256,
270
+ CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnect_v7000 = 257,
271
+ CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerDisconnect_v7000 = 258,
272
+ CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerAcquireFrame_v7000 = 259,
273
+ CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerReleaseFrame_v7000 = 260,
274
+ CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerConnect_v7000 = 261,
275
+ CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerDisconnect_v7000 = 262,
276
+ CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerPresentFrame_v7000 = 263,
277
+ CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamProducerReturnFrame_v7000 = 264,
278
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphicsResourceGetMappedEglFrame_v7000 = 265,
279
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttribute_v8000 = 266,
280
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemRangeGetAttributes_v8000 = 267,
281
+ CUPTI_RUNTIME_TRACE_CBID_cudaEGLStreamConsumerConnectWithFlags_v7000 = 268,
282
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000 = 269,
283
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_ptsz_v9000 = 270,
284
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateFromEGLSync_v9000 = 271,
285
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000 = 272,
286
+ CUPTI_RUNTIME_TRACE_CBID_cudaFuncSetAttribute_v9000 = 273,
287
+ CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalMemory_v10000 = 274,
288
+ CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedBuffer_v10000 = 275,
289
+ CUPTI_RUNTIME_TRACE_CBID_cudaExternalMemoryGetMappedMipmappedArray_v10000 = 276,
290
+ CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalMemory_v10000 = 277,
291
+ CUPTI_RUNTIME_TRACE_CBID_cudaImportExternalSemaphore_v10000 = 278,
292
+ CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v10000 = 279,
293
+ CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_ptsz_v10000 = 280,
294
+ CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v10000 = 281,
295
+ CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_ptsz_v10000 = 282,
296
+ CUPTI_RUNTIME_TRACE_CBID_cudaDestroyExternalSemaphore_v10000 = 283,
297
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_v10000 = 284,
298
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchHostFunc_ptsz_v10000 = 285,
299
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphCreate_v10000 = 286,
300
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetParams_v10000 = 287,
301
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetParams_v10000 = 288,
302
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddKernelNode_v10000 = 289,
303
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode_v10000 = 290,
304
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeGetParams_v10000 = 291,
305
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams_v10000 = 292,
306
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemsetNode_v10000 = 293,
307
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeGetParams_v10000 = 294,
308
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemsetNodeSetParams_v10000 = 295,
309
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddHostNode_v10000 = 296,
310
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeGetParams_v10000 = 297,
311
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddChildGraphNode_v10000 = 298,
312
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphChildGraphNodeGetGraph_v10000 = 299,
313
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEmptyNode_v10000 = 300,
314
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphClone_v10000 = 301,
315
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeFindInClone_v10000 = 302,
316
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetType_v10000 = 303,
317
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetRootNodes_v10000 = 304,
318
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v10000 = 305,
319
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v10000 = 306,
320
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v10000 = 307,
321
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v10000 = 308,
322
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroyNode_v10000 = 309,
323
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v10000 = 310,
324
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_v10000 = 311,
325
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphLaunch_ptsz_v10000 = 312,
326
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecDestroy_v10000 = 313,
327
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphDestroy_v10000 = 314,
328
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_v10000 = 315,
329
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCapture_ptsz_v10000 = 316,
330
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_v10000 = 317,
331
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamIsCapturing_ptsz_v10000 = 318,
332
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_v10000 = 319,
333
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamEndCapture_ptsz_v10000 = 320,
334
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphHostNodeSetParams_v10000 = 321,
335
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetNodes_v10000 = 322,
336
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v10000 = 323,
337
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v10010 = 324,
338
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_ptsz_v10010 = 325,
339
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecKernelNodeSetParams_v10010 = 326,
340
+ CUPTI_RUNTIME_TRACE_CBID_cudaThreadExchangeStreamCaptureMode_v10010 = 327,
341
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetNvSciSyncAttributes_v10020 = 328,
342
+ CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyAvailableDynamicSMemPerBlock_v10200 = 329,
343
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_v10200 = 330,
344
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetFlags_ptsz_v10200 = 331,
345
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams_v10020 = 332,
346
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemsetNodeSetParams_v10020 = 333,
347
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecHostNodeSetParams_v10020 = 334,
348
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecUpdate_v10020 = 335,
349
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetFuncBySymbol_v11000 = 336,
350
+ CUPTI_RUNTIME_TRACE_CBID_cudaCtxResetPersistingL2Cache_v11000 = 337,
351
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeCopyAttributes_v11000 = 338,
352
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeGetAttribute_v11000 = 339,
353
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphKernelNodeSetAttribute_v11000 = 340,
354
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_v11000 = 341,
355
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamCopyAttributes_ptsz_v11000 = 342,
356
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_v11000 = 343,
357
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetAttribute_ptsz_v11000 = 344,
358
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_v11000 = 345,
359
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamSetAttribute_ptsz_v11000 = 346,
360
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetTexture1DLinearMaxWidth_v11010 = 347,
361
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_v10000 = 348,
362
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphUpload_ptsz_v10000 = 349,
363
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeToSymbol_v11010 = 350,
364
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNodeFromSymbol_v11010 = 351,
365
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemcpyNode1D_v11010 = 352,
366
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsToSymbol_v11010 = 353,
367
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParamsFromSymbol_v11010 = 354,
368
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemcpyNodeSetParams1D_v11010 = 355,
369
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsToSymbol_v11010 = 356,
370
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParamsFromSymbol_v11010 = 357,
371
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecMemcpyNodeSetParams1D_v11010 = 358,
372
+ CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetSparseProperties_v11010 = 359,
373
+ CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetSparseProperties_v11010 = 360,
374
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecChildGraphNodeSetParams_v11010 = 361,
375
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventRecordNode_v11010 = 362,
376
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeGetEvent_v11010 = 363,
377
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventRecordNodeSetEvent_v11010 = 364,
378
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddEventWaitNode_v11010 = 365,
379
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeGetEvent_v11010 = 366,
380
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphEventWaitNodeSetEvent_v11010 = 367,
381
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventRecordNodeSetEvent_v11010 = 368,
382
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecEventWaitNodeSetEvent_v11010 = 369,
383
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_v11010 = 370,
384
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventRecordWithFlags_ptsz_v11010 = 371,
385
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetDefaultMemPool_v11020 = 372,
386
+ CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_v11020 = 373,
387
+ CUPTI_RUNTIME_TRACE_CBID_cudaMallocAsync_ptsz_v11020 = 374,
388
+ CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_v11020 = 375,
389
+ CUPTI_RUNTIME_TRACE_CBID_cudaFreeAsync_ptsz_v11020 = 376,
390
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolTrimTo_v11020 = 377,
391
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAttribute_v11020 = 378,
392
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAttribute_v11020 = 379,
393
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolSetAccess_v11020 = 380,
394
+ CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetPlane_v11020 = 381,
395
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolGetAccess_v11020 = 382,
396
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolCreate_v11020 = 383,
397
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolDestroy_v11020 = 384,
398
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetMemPool_v11020 = 385,
399
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetMemPool_v11020 = 386,
400
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportToShareableHandle_v11020 = 387,
401
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportFromShareableHandle_v11020 = 388,
402
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolExportPointer_v11020 = 389,
403
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPoolImportPointer_v11020 = 390,
404
+ CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_v11020 = 391,
405
+ CUPTI_RUNTIME_TRACE_CBID_cudaMallocFromPoolAsync_ptsz_v11020 = 392,
406
+ CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_v11020 = 393,
407
+ CUPTI_RUNTIME_TRACE_CBID_cudaSignalExternalSemaphoresAsync_v2_ptsz_v11020 = 394,
408
+ CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_v11020 = 395,
409
+ CUPTI_RUNTIME_TRACE_CBID_cudaWaitExternalSemaphoresAsync_v2_ptsz_v11020 = 396,
410
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresSignalNode_v11020 = 397,
411
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeGetParams_v11020 = 398,
412
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresSignalNodeSetParams_v11020 = 399,
413
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddExternalSemaphoresWaitNode_v11020 = 400,
414
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeGetParams_v11020 = 401,
415
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExternalSemaphoresWaitNodeSetParams_v11020 = 402,
416
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresSignalNodeSetParams_v11020 = 403,
417
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecExternalSemaphoresWaitNodeSetParams_v11020 = 404,
418
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceFlushGPUDirectRDMAWrites_v11030 = 405,
419
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_v11030 = 406,
420
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPoint_ptsz_v11030 = 407,
421
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphDebugDotPrint_v11030 = 408,
422
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_v11030 = 409,
423
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v2_ptsz_v11030 = 410,
424
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v11030 = 411,
425
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_ptsz_v11030 = 412,
426
+ CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectCreate_v11030 = 413,
427
+ CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRetain_v11030 = 414,
428
+ CUPTI_RUNTIME_TRACE_CBID_cudaUserObjectRelease_v11030 = 415,
429
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphRetainUserObject_v11030 = 416,
430
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphReleaseUserObject_v11030 = 417,
431
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithFlags_v11040 = 418,
432
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemAllocNode_v11040 = 419,
433
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemAllocNodeGetParams_v11040 = 420,
434
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddMemFreeNode_v11040 = 421,
435
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphMemFreeNodeGetParams_v11040 = 422,
436
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGraphMemTrim_v11040 = 423,
437
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceGetGraphMemAttribute_v11040 = 424,
438
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceSetGraphMemAttribute_v11040 = 425,
439
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetEnabled_v11060 = 426,
440
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetEnabled_v11060 = 427,
441
+ CUPTI_RUNTIME_TRACE_CBID_cudaArrayGetMemoryRequirements_v11060 = 428,
442
+ CUPTI_RUNTIME_TRACE_CBID_cudaMipmappedArrayGetMemoryRequirements_v11060 = 429,
443
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_v11060 = 430,
444
+ CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernelExC_ptsz_v11060 = 431,
445
+ CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxPotentialClusterSize_v11070 = 432,
446
+ CUPTI_RUNTIME_TRACE_CBID_cudaOccupancyMaxActiveClusters_v11070 = 433,
447
+ CUPTI_RUNTIME_TRACE_CBID_cudaCreateTextureObject_v2_v11080 = 434,
448
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetTextureObjectTextureDesc_v2_v11080 = 435,
449
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithParams_v12000 = 436,
450
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiateWithParams_ptsz_v12000 = 437,
451
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecGetFlags_v12000 = 438,
452
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetKernel_v12000 = 439,
453
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetDeviceProperties_v2_v12000 = 440,
454
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetId_v12000 = 441,
455
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetId_ptsz_v12000 = 442,
456
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphInstantiate_v12000 = 443,
457
+ CUPTI_RUNTIME_TRACE_CBID_cudaInitDevice_v12000 = 444,
458
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddNode_v12020 = 445,
459
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeSetParams_v12020 = 446,
460
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphExecNodeSetParams_v12020 = 447,
461
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemAdvise_v2_v12020 = 448,
462
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v2_v12020 = 449,
463
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemPrefetchAsync_v2_ptsz_v12020 = 450,
464
+ CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetName_v12030 = 451,
465
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCaptureToGraph_v12030 = 452,
466
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamBeginCaptureToGraph_ptsz_v12030 = 453,
467
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphConditionalHandleCreate_v12030 = 454,
468
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphGetEdges_v2_v12030 = 455,
469
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependencies_v2_v12030 = 456,
470
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphNodeGetDependentNodes_v2_v12030 = 457,
471
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddDependencies_v2_v12030 = 458,
472
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphRemoveDependencies_v2_v12030 = 459,
473
+ CUPTI_RUNTIME_TRACE_CBID_cudaGraphAddNode_v2_v12030 = 460,
474
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v3_v12030 = 461,
475
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetCaptureInfo_v3_ptsz_v12030 = 462,
476
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v2_v12030 = 463,
477
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamUpdateCaptureDependencies_v2_ptsz_v12030 = 464,
478
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceRegisterAsyncNotification_v12040 = 465,
479
+ CUPTI_RUNTIME_TRACE_CBID_cudaDeviceUnregisterAsyncNotification_v12040 = 466,
480
+ CUPTI_RUNTIME_TRACE_CBID_cudaFuncGetParamInfo_v12040 = 467,
481
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPointByVersion_v12050 = 468,
482
+ CUPTI_RUNTIME_TRACE_CBID_cudaGetDriverEntryPointByVersion_ptsz_v12050 = 469,
483
+ CUPTI_RUNTIME_TRACE_CBID_cuda470_v12060 = 470,
484
+ CUPTI_RUNTIME_TRACE_CBID_cuda471_v12060 = 471,
485
+ CUPTI_RUNTIME_TRACE_CBID_cuda472_v12060 = 472,
486
+ CUPTI_RUNTIME_TRACE_CBID_cuda473_v12060 = 473,
487
+ CUPTI_RUNTIME_TRACE_CBID_cuda474_v12060 = 474,
488
+ CUPTI_RUNTIME_TRACE_CBID_cuda475_v12060 = 475,
489
+ CUPTI_RUNTIME_TRACE_CBID_cuda476_v12060 = 476,
490
+ CUPTI_RUNTIME_TRACE_CBID_cuda477_v12060 = 477,
491
+ CUPTI_RUNTIME_TRACE_CBID_cuda478_v12060 = 478,
492
+ CUPTI_RUNTIME_TRACE_CBID_cuda479_v12060 = 479,
493
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetDevice_v12080 = 480,
494
+ CUPTI_RUNTIME_TRACE_CBID_cudaStreamGetDevice_ptsz_v12080 = 481,
495
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyBatchAsync_v12080 = 482,
496
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyBatchAsync_ptsz_v12080 = 483,
497
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DBatchAsync_v12080 = 484,
498
+ CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy3DBatchAsync_ptsz_v12080 = 485,
499
+ CUPTI_RUNTIME_TRACE_CBID_cudaEventElapsedTime_v2_v12080 = 486,
500
+ CUPTI_RUNTIME_TRACE_CBID_SIZE = 487,
501
+ CUPTI_RUNTIME_TRACE_CBID_FORCE_INT = 0x7fffffff
502
+ } CUpti_runtime_api_trace_cbid;
503
+
504
+ #endif
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/device_functions.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__) /* macro not yet defined: emit the direct-use diagnostic below, then define it for the include */
51
+ #if defined(_MSC_VER)
52
+ #pragma message("device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else /* !_MSC_VER: use #warning instead of #pragma message */
54
+ #warning "device_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif /* _MSC_VER */
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__ /* records that this wrapper defined the macro above, so only this wrapper undoes it */
58
+ #endif /* !__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__ */
59
+
60
+ #include "crt/device_functions.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__) /* restore the macro state that existed before this header was included */
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__
65
+ #endif /* __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_FUNCTIONS_H_WRAPPER__ */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/library_types.h ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__LIBRARY_TYPES_H__)
51
+ #define __LIBRARY_TYPES_H__
52
+
53
+
54
+
55
+ #ifndef __CUDACC_RTC_MINIMAL__
56
+
57
+ typedef enum cudaDataType_t /* element-type tags; R = real, C = complex pair. Values are explicit and not in declaration order. */
58
+ {
59
+ CUDA_R_16F = 2, /* real as a half */
60
+ CUDA_C_16F = 6, /* complex as a pair of half numbers */
61
+ CUDA_R_16BF = 14, /* real as a nv_bfloat16 */
62
+ CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */
63
+ CUDA_R_32F = 0, /* real as a float */
64
+ CUDA_C_32F = 4, /* complex as a pair of float numbers */
65
+ CUDA_R_64F = 1, /* real as a double */
66
+ CUDA_C_64F = 5, /* complex as a pair of double numbers */
67
+ CUDA_R_4I = 16, /* real as a signed 4-bit int */
68
+ CUDA_C_4I = 17, /* complex as a pair of signed 4-bit int numbers */
69
+ CUDA_R_4U = 18, /* real as an unsigned 4-bit int */
70
+ CUDA_C_4U = 19, /* complex as a pair of unsigned 4-bit int numbers */
71
+ CUDA_R_8I = 3, /* real as a signed 8-bit int */
72
+ CUDA_C_8I = 7, /* complex as a pair of signed 8-bit int numbers */
73
+ CUDA_R_8U = 8, /* real as an unsigned 8-bit int */
74
+ CUDA_C_8U = 9, /* complex as a pair of unsigned 8-bit int numbers */
75
+ CUDA_R_16I = 20, /* real as a signed 16-bit int */
76
+ CUDA_C_16I = 21, /* complex as a pair of signed 16-bit int numbers */
77
+ CUDA_R_16U = 22, /* real as an unsigned 16-bit int */
78
+ CUDA_C_16U = 23, /* complex as a pair of unsigned 16-bit int numbers */
79
+ CUDA_R_32I = 10, /* real as a signed 32-bit int */
80
+ CUDA_C_32I = 11, /* complex as a pair of signed 32-bit int numbers */
81
+ CUDA_R_32U = 12, /* real as an unsigned 32-bit int */
82
+ CUDA_C_32U = 13, /* complex as a pair of unsigned 32-bit int numbers */
83
+ CUDA_R_64I = 24, /* real as a signed 64-bit int */
84
+ CUDA_C_64I = 25, /* complex as a pair of signed 64-bit int numbers */
85
+ CUDA_R_64U = 26, /* real as an unsigned 64-bit int */
86
+ CUDA_C_64U = 27, /* complex as a pair of unsigned 64-bit int numbers */
87
+ CUDA_R_8F_E4M3 = 28, /* real as a nv_fp8_e4m3 */
88
+ CUDA_R_8F_UE4M3 = CUDA_R_8F_E4M3, /* real as an unsigned nv_fp8_e4m3; alias with the same value (28) as CUDA_R_8F_E4M3, so the two cannot be told apart by value */
89
+ CUDA_R_8F_E5M2 = 29, /* real as a nv_fp8_e5m2 */
90
+ CUDA_R_8F_UE8M0 = 30, /* real as an exponent-only unsigned nv_fp8_e8m0 */
91
+ CUDA_R_6F_E2M3 = 31, /* real as a nv_fp6_e2m3 */
92
+ CUDA_R_6F_E3M2 = 32, /* real as a nv_fp6_e3m2 */
93
+ CUDA_R_4F_E2M1 = 33, /* real as a nv_fp4_e2m1 */
94
+ } cudaDataType;
95
+
96
+
97
+ typedef enum libraryPropertyType_t /* selects which component of a library version is meant; enumerators take the implicit values 0, 1, 2 */
98
+ {
99
+ MAJOR_VERSION, /* implicit value 0 */
100
+ MINOR_VERSION, /* implicit value 1 */
101
+ PATCH_LEVEL /* implicit value 2 */
102
+ } libraryPropertyType;
103
+
104
+
105
+ #ifndef __cplusplus /* C only: an enum tag is not a type name by itself in C, so expose the tag names as typedefs */
106
+ typedef enum cudaDataType_t cudaDataType_t;
107
+ typedef enum libraryPropertyType_t libraryPropertyType_t;
108
+ #endif /* !__cplusplus */
109
+
110
+ #endif /* !__CUDACC_RTC_MINIMAL__ */
111
+ #endif /* !__LIBRARY_TYPES_H__ */
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/nvperf_cuda_host.h ADDED
@@ -0,0 +1,179 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #ifndef NVPERF_CUDA_HOST_H
2
+ #define NVPERF_CUDA_HOST_H
3
+
4
+ /*
5
+ * Copyright 2014-2024 NVIDIA Corporation. All rights reserved.
6
+ *
7
+ * NOTICE TO USER:
8
+ *
9
+ * This source code is subject to NVIDIA ownership rights under U.S. and
10
+ * international Copyright laws.
11
+ *
12
+ * This software and the information contained herein is PROPRIETARY and
13
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and conditions
14
+ * of a form of NVIDIA software license agreement.
15
+ *
16
+ * NVIDIA MAKES NO REPRESENTATION ABOUT THE SUITABILITY OF THIS SOURCE
17
+ * CODE FOR ANY PURPOSE. IT IS PROVIDED "AS IS" WITHOUT EXPRESS OR
18
+ * IMPLIED WARRANTY OF ANY KIND. NVIDIA DISCLAIMS ALL WARRANTIES WITH
19
+ * REGARD TO THIS SOURCE CODE, INCLUDING ALL IMPLIED WARRANTIES OF
20
+ * MERCHANTABILITY, NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
21
+ * IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL,
22
+ * OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS
23
+ * OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE
24
+ * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE
25
+ * OR PERFORMANCE OF THIS SOURCE CODE.
26
+ *
27
+ * U.S. Government End Users. This source code is a "commercial item" as
28
+ * that term is defined at 48 C.F.R. 2.101 (OCT 1995), consisting of
29
+ * "commercial computer software" and "commercial computer software
30
+ * documentation" as such terms are used in 48 C.F.R. 12.212 (SEPT 1995)
31
+ * and is provided to the U.S. Government only as a commercial end item.
32
+ * Consistent with 48 C.F.R.12.212 and 48 C.F.R. 227.7202-1 through
33
+ * 227.7202-4 (JUNE 1995), all U.S. Government End Users acquire the
34
+ * source code with only those rights set forth herein.
35
+ *
36
+ * Any use of this source code in individual and commercial software must
37
+ * include, in the user documentation and internal comments to the code,
38
+ * the above Disclaimer and U.S. Government End Users Notice.
39
+ */
40
+
41
+ #include <stddef.h>
42
+ #include <stdint.h>
43
+ #include "nvperf_common.h"
44
+ #include "nvperf_host.h"
45
+
46
+ #if defined(__GNUC__) && defined(NVPA_SHARED_LIB) /* same condition guards the matching "visibility pop" at the end of this header */
47
+ #pragma GCC visibility push(default)
48
+ #if !defined(NVPW_LOCAL)
49
+ #define NVPW_LOCAL __attribute__ ((visibility ("hidden")))
50
+ #endif /* !NVPW_LOCAL */
51
+ #else /* not (GCC-compatible compiler with NVPA_SHARED_LIB): NVPW_LOCAL expands to nothing */
52
+ #if !defined(NVPW_LOCAL)
53
+ #define NVPW_LOCAL
54
+ #endif /* !NVPW_LOCAL */
55
+ #endif /* __GNUC__ && NVPA_SHARED_LIB */
56
+
57
+ #ifdef __cplusplus
58
+ extern "C" {
59
+ #endif
60
+
61
+ /**
62
+ * @file nvperf_cuda_host.h
63
+ */
64
+
65
+ typedef struct NVPW_CUDA_RawMetricsConfig_Create_Params
65
+ {
66
+ /// [in] size of this params struct - presumably NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE; TODO confirm against NVPA_STRUCT_SIZE in nvperf_common.h
67
+ size_t structSize;
68
+ /// [in] assign to NULL
69
+ void* pPriv;
70
+ /// [in] activity kind for the new configuration (NVPA_ActivityKind is not declared in this header)
71
+ NVPA_ActivityKind activityKind;
72
+ /// [in] chip name string; NOTE(review): accepted names are not listed in this header
73
+ const char* pChipName;
74
+ /// [out] new NVPA_RawMetricsConfig object, returned through this field of the params struct
75
+ struct NVPA_RawMetricsConfig* pRawMetricsConfig;
76
+ } NVPW_CUDA_RawMetricsConfig_Create_Params;
77
+ #define NVPW_CUDA_RawMetricsConfig_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_RawMetricsConfig_Create_Params, pRawMetricsConfig)
78
+
79
+ NVPA_Status NVPW_CUDA_RawMetricsConfig_Create(NVPW_CUDA_RawMetricsConfig_Create_Params* pParams);
81
+
82
+ typedef struct NVPW_CUDA_RawMetricsConfig_Create_V2_Params
83
+ {
84
+ /// [in] size of this params struct - presumably NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE; TODO confirm against NVPA_STRUCT_SIZE in nvperf_common.h
85
+ size_t structSize;
86
+ /// [in] assign to NULL
87
+ void* pPriv;
88
+ /// [in] activity kind for the new configuration (NVPA_ActivityKind is not declared in this header)
89
+ NVPA_ActivityKind activityKind;
90
+ /// [in] accepted for chips supported at the time-of-release.
91
+ const char* pChipName;
92
+ /// [in] buffer with counter availability image - required for future chip support
93
+ const uint8_t* pCounterAvailabilityImage;
94
+ /// [out] new NVPA_RawMetricsConfig object, returned through this field of the params struct
95
+ struct NVPA_RawMetricsConfig* pRawMetricsConfig;
96
+ } NVPW_CUDA_RawMetricsConfig_Create_V2_Params;
97
+ #define NVPW_CUDA_RawMetricsConfig_Create_V2_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_RawMetricsConfig_Create_V2_Params, pRawMetricsConfig)
98
+
99
+ /// Use either 'pChipName' or 'pCounterAvailabilityImage'. Compared with NVPW_CUDA_RawMetricsConfig_Create, the V2 params add 'pCounterAvailabilityImage'.
100
+ NVPA_Status NVPW_CUDA_RawMetricsConfig_Create_V2(NVPW_CUDA_RawMetricsConfig_Create_V2_Params* pParams);
101
+
102
+ typedef struct NVPW_CUDA_CounterDataBuilder_Create_Params
103
+ {
104
+ /// [in] size of this params struct - presumably NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE; TODO confirm against NVPA_STRUCT_SIZE in nvperf_common.h
105
+ size_t structSize;
106
+ /// [in] assign to NULL
107
+ void* pPriv;
108
+ /// [in] accepted for chips supported at the time-of-release.
109
+ const char* pChipName;
110
+ /// [in] buffer with counter availability image - required for future chip support
111
+ const uint8_t* pCounterAvailabilityImage;
112
+ /// [out] new NVPA_CounterDataBuilder object, returned through this field of the params struct
113
+ struct NVPA_CounterDataBuilder* pCounterDataBuilder;
114
+ } NVPW_CUDA_CounterDataBuilder_Create_Params;
115
+ #define NVPW_CUDA_CounterDataBuilder_Create_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_CounterDataBuilder_Create_Params, pCounterDataBuilder)
116
+
117
+ /// Use either 'pChipName' or 'pCounterAvailabilityImage'.
118
+ NVPA_Status NVPW_CUDA_CounterDataBuilder_Create(NVPW_CUDA_CounterDataBuilder_Create_Params* pParams);
119
+
120
+ typedef struct NVPW_MetricsEvaluator NVPW_MetricsEvaluator; /* forward declaration only: the struct is not defined in this header and is used here solely through pointers */
121
+
122
+ typedef struct NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params
123
+ {
124
+ /// [in] size of this params struct - presumably NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE; TODO confirm against NVPA_STRUCT_SIZE in nvperf_common.h
125
+ size_t structSize;
126
+ /// [in] assign to NULL
127
+ void* pPriv;
128
+ /// [in] accepted for chips supported at the time-of-release.
129
+ const char* pChipName;
130
+ /// [in] buffer with counter availability image - required for future chip support
131
+ const uint8_t* pCounterAvailabilityImage;
132
+ /// [out] scratch buffer size to supply to 'NVPW_CUDA_MetricsEvaluator_Initialize' (presumably in bytes - TODO confirm)
133
+ size_t scratchBufferSize;
134
+ } NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params;
135
+ #define NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params, scratchBufferSize)
136
+
137
+ /// Use either 'pChipName' or 'pCounterAvailabilityImage'.
138
+ NVPA_Status NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize(NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize_Params* pParams);
139
+
140
+ typedef struct NVPW_CUDA_MetricsEvaluator_Initialize_Params
141
+ {
142
+ /// [in] size of this params struct - presumably NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE; TODO confirm against NVPA_STRUCT_SIZE in nvperf_common.h
143
+ size_t structSize;
144
+ /// [in] assign to NULL
145
+ void* pPriv;
146
+ /// [in] caller-provided scratch buffer; NOTE(review): required lifetime relative to the returned evaluator is not stated here
147
+ uint8_t* pScratchBuffer;
148
+ /// [in] the size of the 'pScratchBuffer' array, should be at least the size of the 'scratchBufferSize' returned
149
+ /// by 'NVPW_CUDA_MetricsEvaluator_CalculateScratchBufferSize'
150
+ size_t scratchBufferSize;
151
+ /// [in] accepted for chips supported at the time-of-release.
152
+ const char* pChipName;
153
+ /// [in] buffer with counter availability image - required for future chip support
154
+ const uint8_t* pCounterAvailabilityImage;
155
+ /// [in] counter data image; may be NULL when 'pChipName' or 'pCounterAvailabilityImage' is used instead
156
+ const uint8_t* pCounterDataImage;
157
+ /// [in] must be provided if 'pCounterDataImage' is not NULL
158
+ size_t counterDataImageSize;
159
+ /// [out] new NVPW_MetricsEvaluator object, returned through this field of the params struct
160
+ struct NVPW_MetricsEvaluator* pMetricsEvaluator;
161
+ } NVPW_CUDA_MetricsEvaluator_Initialize_Params;
162
+ #define NVPW_CUDA_MetricsEvaluator_Initialize_Params_STRUCT_SIZE NVPA_STRUCT_SIZE(NVPW_CUDA_MetricsEvaluator_Initialize_Params, pMetricsEvaluator)
163
+
164
+ /// Use one of 'pChipName', 'pCounterAvailabilityImage', or 'pCounterDataImage'. 'pChipName' or
165
+ /// 'pCounterAvailabilityImage' will create a metrics evaluator based on a virtual device while 'pCounterDataImage'
166
+ /// will create a metrics evaluator based on the actual device.
167
+ NVPA_Status NVPW_CUDA_MetricsEvaluator_Initialize(NVPW_CUDA_MetricsEvaluator_Initialize_Params* pParams);
168
+
169
+
170
+
171
+ #ifdef __cplusplus
172
+ } // extern "C"
173
+ #endif
174
+
175
+ #if defined(__GNUC__) && defined(NVPA_SHARED_LIB)
176
+ #pragma GCC visibility pop
177
+ #endif
178
+
179
+ #endif // NVPERF_CUDA_HOST_H
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_32_intrinsics.hpp ADDED
@@ -0,0 +1,588 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2020 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_32_INTRINSICS_HPP__)
51
+ #define __SM_32_INTRINSICS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__) /* declaration prefix for every intrinsic in this file: plain __device__ when __CUDACC_RTC__ is defined */
54
+ #define __SM_32_INTRINSICS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__: static inline device functions */
56
+ #define __SM_32_INTRINSICS_DECL__ static __device__ __inline__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ // In here are intrinsics which are built in to the compiler. These may be
72
+ // referenced by intrinsic implementations from this file.
73
+ extern "C"
74
+ {
75
+ // There are no intrinsics built in to the compiler for SM-3.5,
76
+ // all intrinsics are now implemented as inline PTX below.
77
+ }
78
+
79
+ /*******************************************************************************
80
+ * *
81
+ * Below are implementations of SM-3.5 intrinsics which are included as *
82
+ * source (instead of being built in to the compiler) *
83
+ * *
84
+ *******************************************************************************/
85
+
86
+ // LDG is a "load from global via texture path" command which can exhibit higher
87
+ // bandwidth on GK110 than a regular LD.
88
+ // __LDG_PTR is the inline-asm constraint string used for the pointer operand of every load below:
89
+ #if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__) // 64-bit pointers (always assumed when __CUDACC_RTC__ is defined)
90
+ #define __LDG_PTR "l"
91
+ #else // 32-bit pointers
92
+ #define __LDG_PTR "r"
93
+ #endif
94
+
95
+ /******************************************************************************
96
+ * __ldg *
97
+ ******************************************************************************/
98
+
99
+ // Size of long is architecture and OS specific, so the long overloads pick the load width from __LP64__.
100
+ #if defined(__LP64__) // 64 bits: use the s64/u64 loads with a 64-bit ("=l") result operand
101
+ __SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
102
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
103
+ #else // 32 bits: use the s32/u32 loads with a 32-bit ("=r") result operand
104
+ __SM_32_INTRINSICS_DECL__ long __ldg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
105
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.nc.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
106
+ #endif
107
+
108
+
109
+ __SM_32_INTRINSICS_DECL__ char __ldg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; } // 8-bit loads land in a 32-bit ("=r") operand and are narrowed by the cast
110
+ __SM_32_INTRINSICS_DECL__ signed char __ldg(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
111
+ __SM_32_INTRINSICS_DECL__ short __ldg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
112
+ __SM_32_INTRINSICS_DECL__ int __ldg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
113
+ __SM_32_INTRINSICS_DECL__ long long __ldg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
114
+ __SM_32_INTRINSICS_DECL__ char2 __ldg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.nc.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; } // char vectors go through an int2/int4 temporary, then each lane is narrowed
115
+ __SM_32_INTRINSICS_DECL__ char4 __ldg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.nc.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
116
+ __SM_32_INTRINSICS_DECL__ short2 __ldg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.nc.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
117
+ __SM_32_INTRINSICS_DECL__ short4 __ldg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.nc.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
118
+ __SM_32_INTRINSICS_DECL__ int2 __ldg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.nc.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
119
+ __SM_32_INTRINSICS_DECL__ int4 __ldg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.nc.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
120
+ __SM_32_INTRINSICS_DECL__ longlong2 __ldg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.nc.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
121
+
122
+ __SM_32_INTRINSICS_DECL__ unsigned char __ldg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (unsigned char)ret; } // unsigned counterparts of the signed overloads: same shape, .u8/.u16/.u32/.u64 load types
123
+ __SM_32_INTRINSICS_DECL__ unsigned short __ldg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.nc.u16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
124
+ __SM_32_INTRINSICS_DECL__ unsigned int __ldg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.nc.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
125
+ __SM_32_INTRINSICS_DECL__ unsigned long long __ldg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.nc.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
126
+ __SM_32_INTRINSICS_DECL__ uchar2 __ldg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.nc.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; } // uchar vectors go through a uint2/uint4 temporary, then each lane is narrowed
127
+ __SM_32_INTRINSICS_DECL__ uchar4 __ldg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.nc.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
128
+ __SM_32_INTRINSICS_DECL__ ushort2 __ldg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.nc.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
129
+ __SM_32_INTRINSICS_DECL__ ushort4 __ldg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.nc.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
130
+ __SM_32_INTRINSICS_DECL__ uint2 __ldg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.nc.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
131
+ __SM_32_INTRINSICS_DECL__ uint4 __ldg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.nc.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
132
+ __SM_32_INTRINSICS_DECL__ ulonglong2 __ldg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.nc.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
133
+
134
+ __SM_32_INTRINSICS_DECL__ float __ldg(const float *ptr) { float ret; asm volatile ("ld.global.nc.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; } // floating-point overloads load straight into the result ("=f" for float, "=d" for double), no temporary or cast
135
+ __SM_32_INTRINSICS_DECL__ double __ldg(const double *ptr) { double ret; asm volatile ("ld.global.nc.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
136
+ __SM_32_INTRINSICS_DECL__ float2 __ldg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.nc.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
137
+ __SM_32_INTRINSICS_DECL__ float4 __ldg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.nc.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
138
+ __SM_32_INTRINSICS_DECL__ double2 __ldg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.nc.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
139
+
140
+
141
+ /******************************************************************************
142
+ * __ldcg *
143
+ ******************************************************************************/
144
+
145
+ // Size of long is architecture and OS specific, so the long overloads pick the load width from __LP64__.
146
+ #if defined(__LP64__) // 64 bits: use the s64/u64 loads with a 64-bit ("=l") result operand
147
+ __SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
148
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
149
+ #else // 32 bits: use the s32/u32 loads with a 32-bit ("=r") result operand
150
+ __SM_32_INTRINSICS_DECL__ long __ldcg(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
151
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldcg(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cg.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
152
+ #endif
153
+
154
+
155
+ __SM_32_INTRINSICS_DECL__ char __ldcg(const char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; } // same shape as the matching __ldg overloads, but issuing ld.global.cg instead of ld.global.nc
156
+ __SM_32_INTRINSICS_DECL__ signed char __ldcg(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
157
+ __SM_32_INTRINSICS_DECL__ short __ldcg(const short *ptr) { unsigned short ret; asm volatile ("ld.global.cg.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
158
+ __SM_32_INTRINSICS_DECL__ int __ldcg(const int *ptr) { unsigned int ret; asm volatile ("ld.global.cg.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
159
+ __SM_32_INTRINSICS_DECL__ long long __ldcg(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cg.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
160
+ __SM_32_INTRINSICS_DECL__ char2 __ldcg(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.cg.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; } // char vectors go through an int2/int4 temporary, then each lane is narrowed
161
+ __SM_32_INTRINSICS_DECL__ char4 __ldcg(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.cg.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
162
+ __SM_32_INTRINSICS_DECL__ short2 __ldcg(const short2 *ptr) { short2 ret; asm volatile ("ld.global.cg.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
163
+ __SM_32_INTRINSICS_DECL__ short4 __ldcg(const short4 *ptr) { short4 ret; asm volatile ("ld.global.cg.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
164
+ __SM_32_INTRINSICS_DECL__ int2 __ldcg(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cg.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
165
+ __SM_32_INTRINSICS_DECL__ int4 __ldcg(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cg.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
166
+ __SM_32_INTRINSICS_DECL__ longlong2 __ldcg(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.cg.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
167
+
168
+ __SM_32_INTRINSICS_DECL__ unsigned char __ldcg(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.cg.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (unsigned char)ret; }
169
+ __SM_32_INTRINSICS_DECL__ unsigned short __ldcg(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.cg.u16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
170
+ __SM_32_INTRINSICS_DECL__ unsigned int __ldcg(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.cg.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
171
+ __SM_32_INTRINSICS_DECL__ unsigned long long __ldcg(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cg.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
172
+ __SM_32_INTRINSICS_DECL__ uchar2 __ldcg(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.cg.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
173
+ __SM_32_INTRINSICS_DECL__ uchar4 __ldcg(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.cg.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
174
+ __SM_32_INTRINSICS_DECL__ ushort2 __ldcg(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.cg.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
175
+ __SM_32_INTRINSICS_DECL__ ushort4 __ldcg(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.cg.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
176
+ __SM_32_INTRINSICS_DECL__ uint2 __ldcg(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.cg.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
177
+ __SM_32_INTRINSICS_DECL__ uint4 __ldcg(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.cg.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
178
+ __SM_32_INTRINSICS_DECL__ ulonglong2 __ldcg(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.cg.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
179
+
180
+ __SM_32_INTRINSICS_DECL__ float __ldcg(const float *ptr) { float ret; asm volatile ("ld.global.cg.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
181
+ __SM_32_INTRINSICS_DECL__ double __ldcg(const double *ptr) { double ret; asm volatile ("ld.global.cg.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
182
+ __SM_32_INTRINSICS_DECL__ float2 __ldcg(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cg.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
183
+ __SM_32_INTRINSICS_DECL__ float4 __ldcg(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cg.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
184
+ __SM_32_INTRINSICS_DECL__ double2 __ldcg(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cg.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
185
+
186
+ /******************************************************************************
187
+ * __ldca *
188
+ ******************************************************************************/
189
+
190
+ // Size of long is architecture and OS specific.
191
+ #if defined(__LP64__) // 64 bits
192
+ __SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
193
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
194
+ #else // 32 bits
195
+ __SM_32_INTRINSICS_DECL__ long __ldca(const long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
196
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldca(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.ca.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
197
+ #endif
198
+
199
+
200
+ __SM_32_INTRINSICS_DECL__ char __ldca(const char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
201
+ __SM_32_INTRINSICS_DECL__ signed char __ldca(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
202
+ __SM_32_INTRINSICS_DECL__ short __ldca(const short *ptr) { unsigned short ret; asm volatile ("ld.global.ca.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
203
+ __SM_32_INTRINSICS_DECL__ int __ldca(const int *ptr) { unsigned int ret; asm volatile ("ld.global.ca.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
204
+ __SM_32_INTRINSICS_DECL__ long long __ldca(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.ca.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
205
+ __SM_32_INTRINSICS_DECL__ char2 __ldca(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.ca.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
206
+ __SM_32_INTRINSICS_DECL__ char4 __ldca(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.ca.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
207
+ __SM_32_INTRINSICS_DECL__ short2 __ldca(const short2 *ptr) { short2 ret; asm volatile ("ld.global.ca.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
208
+ __SM_32_INTRINSICS_DECL__ short4 __ldca(const short4 *ptr) { short4 ret; asm volatile ("ld.global.ca.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
209
+ __SM_32_INTRINSICS_DECL__ int2 __ldca(const int2 *ptr) { int2 ret; asm volatile ("ld.global.ca.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
210
+ __SM_32_INTRINSICS_DECL__ int4 __ldca(const int4 *ptr) { int4 ret; asm volatile ("ld.global.ca.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
211
+ __SM_32_INTRINSICS_DECL__ longlong2 __ldca(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.ca.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
212
+
213
+ __SM_32_INTRINSICS_DECL__ unsigned char __ldca(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.ca.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (unsigned char)ret; }
214
+ __SM_32_INTRINSICS_DECL__ unsigned short __ldca(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.ca.u16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
215
+ __SM_32_INTRINSICS_DECL__ unsigned int __ldca(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.ca.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
216
+ __SM_32_INTRINSICS_DECL__ unsigned long long __ldca(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.ca.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
217
+ __SM_32_INTRINSICS_DECL__ uchar2 __ldca(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.ca.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
218
+ __SM_32_INTRINSICS_DECL__ uchar4 __ldca(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.ca.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
219
+ __SM_32_INTRINSICS_DECL__ ushort2 __ldca(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.ca.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
220
+ __SM_32_INTRINSICS_DECL__ ushort4 __ldca(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.ca.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
221
+ __SM_32_INTRINSICS_DECL__ uint2 __ldca(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.ca.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
222
+ __SM_32_INTRINSICS_DECL__ uint4 __ldca(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.ca.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
223
+ __SM_32_INTRINSICS_DECL__ ulonglong2 __ldca(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.ca.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
224
+
225
+ __SM_32_INTRINSICS_DECL__ float __ldca(const float *ptr) { float ret; asm volatile ("ld.global.ca.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
226
+ __SM_32_INTRINSICS_DECL__ double __ldca(const double *ptr) { double ret; asm volatile ("ld.global.ca.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
227
+ __SM_32_INTRINSICS_DECL__ float2 __ldca(const float2 *ptr) { float2 ret; asm volatile ("ld.global.ca.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
228
+ __SM_32_INTRINSICS_DECL__ float4 __ldca(const float4 *ptr) { float4 ret; asm volatile ("ld.global.ca.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
229
+ __SM_32_INTRINSICS_DECL__ double2 __ldca(const double2 *ptr) { double2 ret; asm volatile ("ld.global.ca.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
230
+
231
+ /******************************************************************************
232
+ * __ldcs *
233
+ ******************************************************************************/
234
+
235
+ // Size of long is architecture and OS specific.
236
+ #if defined(__LP64__) // 64 bits
237
+ __SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long)ret; }
238
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
239
+ #else // 32 bits
240
+ __SM_32_INTRINSICS_DECL__ long __ldcs(const long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (long)ret; }
241
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldcs(const unsigned long *ptr) { unsigned long ret; asm volatile ("ld.global.cs.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
242
+ #endif
243
+
244
+
245
+ __SM_32_INTRINSICS_DECL__ char __ldcs(const char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (char)ret; }
246
+ __SM_32_INTRINSICS_DECL__ signed char __ldcs(const signed char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (signed char)ret; }
247
+ __SM_32_INTRINSICS_DECL__ short __ldcs(const short *ptr) { unsigned short ret; asm volatile ("ld.global.cs.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return (short)ret; }
248
+ __SM_32_INTRINSICS_DECL__ int __ldcs(const int *ptr) { unsigned int ret; asm volatile ("ld.global.cs.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (int)ret; }
249
+ __SM_32_INTRINSICS_DECL__ long long __ldcs(const long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cs.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return (long long)ret; }
250
+ __SM_32_INTRINSICS_DECL__ char2 __ldcs(const char2 *ptr) { char2 ret; int2 tmp; asm volatile ("ld.global.cs.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
251
+ __SM_32_INTRINSICS_DECL__ char4 __ldcs(const char4 *ptr) { char4 ret; int4 tmp; asm volatile ("ld.global.cs.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
252
+ __SM_32_INTRINSICS_DECL__ short2 __ldcs(const short2 *ptr) { short2 ret; asm volatile ("ld.global.cs.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
253
+ __SM_32_INTRINSICS_DECL__ short4 __ldcs(const short4 *ptr) { short4 ret; asm volatile ("ld.global.cs.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
254
+ __SM_32_INTRINSICS_DECL__ int2 __ldcs(const int2 *ptr) { int2 ret; asm volatile ("ld.global.cs.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
255
+ __SM_32_INTRINSICS_DECL__ int4 __ldcs(const int4 *ptr) { int4 ret; asm volatile ("ld.global.cs.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
256
+ __SM_32_INTRINSICS_DECL__ longlong2 __ldcs(const longlong2 *ptr) { longlong2 ret; asm volatile ("ld.global.cs.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
257
+
258
+ __SM_32_INTRINSICS_DECL__ unsigned char __ldcs(const unsigned char *ptr) { unsigned int ret; asm volatile ("ld.global.cs.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return (unsigned char)ret; }
259
+ __SM_32_INTRINSICS_DECL__ unsigned short __ldcs(const unsigned short *ptr) { unsigned short ret; asm volatile ("ld.global.cs.u16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr)); return ret; }
260
+ __SM_32_INTRINSICS_DECL__ unsigned int __ldcs(const unsigned int *ptr) { unsigned int ret; asm volatile ("ld.global.cs.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr)); return ret; }
261
+ __SM_32_INTRINSICS_DECL__ unsigned long long __ldcs(const unsigned long long *ptr) { unsigned long long ret; asm volatile ("ld.global.cs.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr)); return ret; }
262
+ __SM_32_INTRINSICS_DECL__ uchar2 __ldcs(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm volatile ("ld.global.cs.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
263
+ __SM_32_INTRINSICS_DECL__ uchar4 __ldcs(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm volatile ("ld.global.cs.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr)); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
264
+ __SM_32_INTRINSICS_DECL__ ushort2 __ldcs(const ushort2 *ptr) { ushort2 ret; asm volatile ("ld.global.cs.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr)); return ret; }
265
+ __SM_32_INTRINSICS_DECL__ ushort4 __ldcs(const ushort4 *ptr) { ushort4 ret; asm volatile ("ld.global.cs.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr)); return ret; }
266
+ __SM_32_INTRINSICS_DECL__ uint2 __ldcs(const uint2 *ptr) { uint2 ret; asm volatile ("ld.global.cs.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr)); return ret; }
267
+ __SM_32_INTRINSICS_DECL__ uint4 __ldcs(const uint4 *ptr) { uint4 ret; asm volatile ("ld.global.cs.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr)); return ret; }
268
+ __SM_32_INTRINSICS_DECL__ ulonglong2 __ldcs(const ulonglong2 *ptr) { ulonglong2 ret; asm volatile ("ld.global.cs.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr)); return ret; }
269
+
270
+ __SM_32_INTRINSICS_DECL__ float __ldcs(const float *ptr) { float ret; asm volatile ("ld.global.cs.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr)); return ret; }
271
+ __SM_32_INTRINSICS_DECL__ double __ldcs(const double *ptr) { double ret; asm volatile ("ld.global.cs.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr)); return ret; }
272
+ __SM_32_INTRINSICS_DECL__ float2 __ldcs(const float2 *ptr) { float2 ret; asm volatile ("ld.global.cs.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr)); return ret; }
273
+ __SM_32_INTRINSICS_DECL__ float4 __ldcs(const float4 *ptr) { float4 ret; asm volatile ("ld.global.cs.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr)); return ret; }
274
+ __SM_32_INTRINSICS_DECL__ double2 __ldcs(const double2 *ptr) { double2 ret; asm volatile ("ld.global.cs.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr)); return ret; }
275
+
276
+ /******************************************************************************
277
+ * __ldlu *
278
+ ******************************************************************************/
279
+
280
+ // Size of long is architecture and OS specific.
281
+ #if defined(__LP64__) // 64 bits
282
+ __SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
283
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
284
+ #else // 32 bits
285
+ __SM_32_INTRINSICS_DECL__ long __ldlu(const long *ptr) { unsigned long ret; asm ("ld.global.lu.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
286
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldlu(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.lu.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
287
+ #endif
288
+
289
+
290
+ __SM_32_INTRINSICS_DECL__ char __ldlu(const char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; }
291
+ __SM_32_INTRINSICS_DECL__ signed char __ldlu(const signed char *ptr) { unsigned int ret; asm ("ld.global.lu.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; }
292
+ __SM_32_INTRINSICS_DECL__ short __ldlu(const short *ptr) { unsigned short ret; asm ("ld.global.lu.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; }
293
+ __SM_32_INTRINSICS_DECL__ int __ldlu(const int *ptr) { unsigned int ret; asm ("ld.global.lu.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; }
294
+ __SM_32_INTRINSICS_DECL__ long long __ldlu(const long long *ptr) { unsigned long long ret; asm ("ld.global.lu.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; }
295
+ __SM_32_INTRINSICS_DECL__ char2 __ldlu(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.lu.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
296
+ __SM_32_INTRINSICS_DECL__ char4 __ldlu(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.lu.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
297
+ __SM_32_INTRINSICS_DECL__ short2 __ldlu(const short2 *ptr) { short2 ret; asm ("ld.global.lu.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
298
+ __SM_32_INTRINSICS_DECL__ short4 __ldlu(const short4 *ptr) { short4 ret; asm ("ld.global.lu.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
299
+ __SM_32_INTRINSICS_DECL__ int2 __ldlu(const int2 *ptr) { int2 ret; asm ("ld.global.lu.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
300
+ __SM_32_INTRINSICS_DECL__ int4 __ldlu(const int4 *ptr) { int4 ret; asm ("ld.global.lu.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
301
+ __SM_32_INTRINSICS_DECL__ longlong2 __ldlu(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.lu.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
302
+
303
+ __SM_32_INTRINSICS_DECL__ unsigned char __ldlu(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.lu.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (unsigned char)ret; }
304
+ __SM_32_INTRINSICS_DECL__ unsigned short __ldlu(const unsigned short *ptr) { unsigned short ret; asm ("ld.global.lu.u16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
305
+ __SM_32_INTRINSICS_DECL__ unsigned int __ldlu(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.lu.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
306
+ __SM_32_INTRINSICS_DECL__ unsigned long long __ldlu(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.lu.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
307
+ __SM_32_INTRINSICS_DECL__ uchar2 __ldlu(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.lu.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; }
308
+ __SM_32_INTRINSICS_DECL__ uchar4 __ldlu(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.lu.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
309
+ __SM_32_INTRINSICS_DECL__ ushort2 __ldlu(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.lu.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
310
+ __SM_32_INTRINSICS_DECL__ ushort4 __ldlu(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.lu.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
311
+ __SM_32_INTRINSICS_DECL__ uint2 __ldlu(const uint2 *ptr) { uint2 ret; asm ("ld.global.lu.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
312
+ __SM_32_INTRINSICS_DECL__ uint4 __ldlu(const uint4 *ptr) { uint4 ret; asm ("ld.global.lu.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
313
+ __SM_32_INTRINSICS_DECL__ ulonglong2 __ldlu(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.lu.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
314
+
315
+ __SM_32_INTRINSICS_DECL__ float __ldlu(const float *ptr) { float ret; asm ("ld.global.lu.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
316
+ __SM_32_INTRINSICS_DECL__ double __ldlu(const double *ptr) { double ret; asm ("ld.global.lu.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
317
+ __SM_32_INTRINSICS_DECL__ float2 __ldlu(const float2 *ptr) { float2 ret; asm ("ld.global.lu.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
318
+ __SM_32_INTRINSICS_DECL__ float4 __ldlu(const float4 *ptr) { float4 ret; asm ("ld.global.lu.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
319
+ __SM_32_INTRINSICS_DECL__ double2 __ldlu(const double2 *ptr) { double2 ret; asm ("ld.global.lu.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
320
+
321
+ /******************************************************************************
322
+ * __ldcv *
323
+ ******************************************************************************/
324
+
325
+ // Size of long is architecture and OS specific.
326
+ #if defined(__LP64__) // 64 bits
327
+ __SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
328
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
329
+ #else // 32 bits
330
+ __SM_32_INTRINSICS_DECL__ long __ldcv(const long *ptr) { unsigned long ret; asm ("ld.global.cv.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (long)ret; }
331
+ __SM_32_INTRINSICS_DECL__ unsigned long __ldcv(const unsigned long *ptr) { unsigned long ret; asm ("ld.global.cv.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
332
+ #endif
333
+
334
+
335
+ __SM_32_INTRINSICS_DECL__ char __ldcv(const char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (char)ret; }
336
+ __SM_32_INTRINSICS_DECL__ signed char __ldcv(const signed char *ptr) { unsigned int ret; asm ("ld.global.cv.s8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (signed char)ret; }
337
+ __SM_32_INTRINSICS_DECL__ short __ldcv(const short *ptr) { unsigned short ret; asm ("ld.global.cv.s16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return (short)ret; }
338
+ __SM_32_INTRINSICS_DECL__ int __ldcv(const int *ptr) { unsigned int ret; asm ("ld.global.cv.s32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (int)ret; }
339
+ __SM_32_INTRINSICS_DECL__ long long __ldcv(const long long *ptr) { unsigned long long ret; asm ("ld.global.cv.s64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return (long long)ret; }
340
+ __SM_32_INTRINSICS_DECL__ char2 __ldcv(const char2 *ptr) { char2 ret; int2 tmp; asm ("ld.global.cv.v2.s8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; return ret; }
341
+ __SM_32_INTRINSICS_DECL__ char4 __ldcv(const char4 *ptr) { char4 ret; int4 tmp; asm ("ld.global.cv.v4.s8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (char)tmp.x; ret.y = (char)tmp.y; ret.z = (char)tmp.z; ret.w = (char)tmp.w; return ret; }
342
+ __SM_32_INTRINSICS_DECL__ short2 __ldcv(const short2 *ptr) { short2 ret; asm ("ld.global.cv.v2.s16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
343
+ __SM_32_INTRINSICS_DECL__ short4 __ldcv(const short4 *ptr) { short4 ret; asm ("ld.global.cv.v4.s16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
344
+ __SM_32_INTRINSICS_DECL__ int2 __ldcv(const int2 *ptr) { int2 ret; asm ("ld.global.cv.v2.s32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
345
+ __SM_32_INTRINSICS_DECL__ int4 __ldcv(const int4 *ptr) { int4 ret; asm ("ld.global.cv.v4.s32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
346
+ __SM_32_INTRINSICS_DECL__ longlong2 __ldcv(const longlong2 *ptr) { longlong2 ret; asm ("ld.global.cv.v2.s64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
347
+
348
+ __SM_32_INTRINSICS_DECL__ unsigned char __ldcv(const unsigned char *ptr) { unsigned int ret; asm ("ld.global.cv.u8 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return (unsigned char)ret; } // the byte is loaded into a 32-bit "r" register and narrowed on return
349
+ __SM_32_INTRINSICS_DECL__ unsigned short __ldcv(const unsigned short *ptr) { unsigned short ret; asm ("ld.global.cv.u16 %0, [%1];" : "=h"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
350
+ __SM_32_INTRINSICS_DECL__ unsigned int __ldcv(const unsigned int *ptr) { unsigned int ret; asm ("ld.global.cv.u32 %0, [%1];" : "=r"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
351
+ __SM_32_INTRINSICS_DECL__ unsigned long long __ldcv(const unsigned long long *ptr) { unsigned long long ret; asm ("ld.global.cv.u64 %0, [%1];" : "=l"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
352
+ __SM_32_INTRINSICS_DECL__ uchar2 __ldcv(const uchar2 *ptr) { uchar2 ret; uint2 tmp; asm ("ld.global.cv.v2.u8 {%0,%1}, [%2];" : "=r"(tmp.x), "=r"(tmp.y) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; return ret; } // 8-bit components are loaded into the 32-bit fields of tmp, then narrowed into ret
353
+ __SM_32_INTRINSICS_DECL__ uchar4 __ldcv(const uchar4 *ptr) { uchar4 ret; uint4 tmp; asm ("ld.global.cv.v4.u8 {%0,%1,%2,%3}, [%4];" : "=r"(tmp.x), "=r"(tmp.y), "=r"(tmp.z), "=r"(tmp.w) : __LDG_PTR (ptr) : "memory"); ret.x = (unsigned char)tmp.x; ret.y = (unsigned char)tmp.y; ret.z = (unsigned char)tmp.z; ret.w = (unsigned char)tmp.w; return ret; }
354
+ __SM_32_INTRINSICS_DECL__ ushort2 __ldcv(const ushort2 *ptr) { ushort2 ret; asm ("ld.global.cv.v2.u16 {%0,%1}, [%2];" : "=h"(ret.x), "=h"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
355
+ __SM_32_INTRINSICS_DECL__ ushort4 __ldcv(const ushort4 *ptr) { ushort4 ret; asm ("ld.global.cv.v4.u16 {%0,%1,%2,%3}, [%4];" : "=h"(ret.x), "=h"(ret.y), "=h"(ret.z), "=h"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
356
+ __SM_32_INTRINSICS_DECL__ uint2 __ldcv(const uint2 *ptr) { uint2 ret; asm ("ld.global.cv.v2.u32 {%0,%1}, [%2];" : "=r"(ret.x), "=r"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
357
+ __SM_32_INTRINSICS_DECL__ uint4 __ldcv(const uint4 *ptr) { uint4 ret; asm ("ld.global.cv.v4.u32 {%0,%1,%2,%3}, [%4];" : "=r"(ret.x), "=r"(ret.y), "=r"(ret.z), "=r"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
358
+ __SM_32_INTRINSICS_DECL__ ulonglong2 __ldcv(const ulonglong2 *ptr) { ulonglong2 ret; asm ("ld.global.cv.v2.u64 {%0,%1}, [%2];" : "=l"(ret.x), "=l"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
359
+ // Floating-point overloads: float operands use the "f" constraint, double operands use "d".
360
+ __SM_32_INTRINSICS_DECL__ float __ldcv(const float *ptr) { float ret; asm ("ld.global.cv.f32 %0, [%1];" : "=f"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
361
+ __SM_32_INTRINSICS_DECL__ double __ldcv(const double *ptr) { double ret; asm ("ld.global.cv.f64 %0, [%1];" : "=d"(ret) : __LDG_PTR (ptr) : "memory"); return ret; }
362
+ __SM_32_INTRINSICS_DECL__ float2 __ldcv(const float2 *ptr) { float2 ret; asm ("ld.global.cv.v2.f32 {%0,%1}, [%2];" : "=f"(ret.x), "=f"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
363
+ __SM_32_INTRINSICS_DECL__ float4 __ldcv(const float4 *ptr) { float4 ret; asm ("ld.global.cv.v4.f32 {%0,%1,%2,%3}, [%4];" : "=f"(ret.x), "=f"(ret.y), "=f"(ret.z), "=f"(ret.w) : __LDG_PTR (ptr) : "memory"); return ret; }
364
+ __SM_32_INTRINSICS_DECL__ double2 __ldcv(const double2 *ptr) { double2 ret; asm ("ld.global.cv.v2.f64 {%0,%1}, [%2];" : "=d"(ret.x), "=d"(ret.y) : __LDG_PTR (ptr) : "memory"); return ret; }
365
+
366
+ /******************************************************************************
367
+ * __stwb *
+ * *
+ * One overload per scalar and built-in vector type. Each overload issues a *
+ * single inline-PTX `st.global.wb` store of `value` to `ptr`; `.wb` is the *
+ * write-back cache operator in the PTX ISA. 8-bit values are widened to int *
+ * because they are bound with the 32-bit "r" constraint ("h" is used for *
+ * 16-bit, "l" for 64-bit, "f" for float and "d" for double operands). *
+ * __LDG_PTR is defined outside this section; it appears in the operand list *
+ * where the address constraint string is expected. *
368
+ ******************************************************************************/
369
+
370
+ // Size of long is architecture and OS specific, so long/unsigned long use the 64-bit store under __LP64__ and the 32-bit store otherwise.
371
+ #if defined(__LP64__) // 64 bits
372
+ __SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
373
+ __SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
374
+ #else // 32 bits
375
+ __SM_32_INTRINSICS_DECL__ void __stwb(long *ptr, long value) { asm ("st.global.wb.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
376
+ __SM_32_INTRINSICS_DECL__ void __stwb(unsigned long *ptr, unsigned long value) { asm ("st.global.wb.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
377
+ #endif
378
+
379
+
380
+ __SM_32_INTRINSICS_DECL__ void __stwb(char *ptr, char value) { asm ("st.global.wb.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
381
+ __SM_32_INTRINSICS_DECL__ void __stwb(signed char *ptr, signed char value) { asm ("st.global.wb.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
382
+ __SM_32_INTRINSICS_DECL__ void __stwb(short *ptr, short value) { asm ("st.global.wb.s16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); }
383
+ __SM_32_INTRINSICS_DECL__ void __stwb(int *ptr, int value) { asm ("st.global.wb.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
384
+ __SM_32_INTRINSICS_DECL__ void __stwb(long long *ptr, long long value) { asm ("st.global.wb.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
385
+ __SM_32_INTRINSICS_DECL__ void __stwb(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
386
+ __SM_32_INTRINSICS_DECL__ void __stwb(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
387
+ __SM_32_INTRINSICS_DECL__ void __stwb(short2 *ptr, short2 value) { asm ("st.global.wb.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
388
+ __SM_32_INTRINSICS_DECL__ void __stwb(short4 *ptr, short4 value) { asm ("st.global.wb.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
389
+ __SM_32_INTRINSICS_DECL__ void __stwb(int2 *ptr, int2 value) { asm ("st.global.wb.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
390
+ __SM_32_INTRINSICS_DECL__ void __stwb(int4 *ptr, int4 value) { asm ("st.global.wb.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
391
+ __SM_32_INTRINSICS_DECL__ void __stwb(longlong2 *ptr, longlong2 value) { asm ("st.global.wb.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
392
+
393
+ __SM_32_INTRINSICS_DECL__ void __stwb(unsigned char *ptr, unsigned char value) { asm ("st.global.wb.u8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
394
+ __SM_32_INTRINSICS_DECL__ void __stwb(unsigned short *ptr, unsigned short value) { asm ("st.global.wb.u16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); }
395
+ __SM_32_INTRINSICS_DECL__ void __stwb(unsigned int *ptr, unsigned int value) { asm ("st.global.wb.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
396
+ __SM_32_INTRINSICS_DECL__ void __stwb(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wb.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
397
+ __SM_32_INTRINSICS_DECL__ void __stwb(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.wb.v2.u8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
398
+ __SM_32_INTRINSICS_DECL__ void __stwb(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wb.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
399
+ __SM_32_INTRINSICS_DECL__ void __stwb(ushort2 *ptr, ushort2 value) { asm ("st.global.wb.v2.u16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
400
+ __SM_32_INTRINSICS_DECL__ void __stwb(ushort4 *ptr, ushort4 value) { asm ("st.global.wb.v4.u16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
401
+ __SM_32_INTRINSICS_DECL__ void __stwb(uint2 *ptr, uint2 value) { asm ("st.global.wb.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
402
+ __SM_32_INTRINSICS_DECL__ void __stwb(uint4 *ptr, uint4 value) { asm ("st.global.wb.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
403
+ __SM_32_INTRINSICS_DECL__ void __stwb(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wb.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
404
+
405
+ __SM_32_INTRINSICS_DECL__ void __stwb(float *ptr, float value) { asm ("st.global.wb.f32 [%0], %1;" :: __LDG_PTR (ptr), "f"(value) : "memory"); }
406
+ __SM_32_INTRINSICS_DECL__ void __stwb(double *ptr, double value) { asm ("st.global.wb.f64 [%0], %1;" :: __LDG_PTR (ptr), "d"(value) : "memory"); }
407
+ __SM_32_INTRINSICS_DECL__ void __stwb(float2 *ptr, float2 value) { asm ("st.global.wb.v2.f32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
408
+ __SM_32_INTRINSICS_DECL__ void __stwb(float4 *ptr, float4 value) { asm ("st.global.wb.v4.f32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
409
+ __SM_32_INTRINSICS_DECL__ void __stwb(double2 *ptr, double2 value) { asm ("st.global.wb.v2.f64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
410
+
411
+ /******************************************************************************
412
+ * __stcg *
+ * *
+ * One overload per scalar and built-in vector type. Each overload issues a *
+ * single inline-PTX `st.global.cg` store of `value` to `ptr`; `.cg` is the *
+ * cache-at-global-level operator in the PTX ISA. 8-bit values are widened to *
+ * int because they are bound with the 32-bit "r" constraint ("h" is used for *
+ * 16-bit, "l" for 64-bit, "f" for float and "d" for double operands). *
+ * __LDG_PTR is defined outside this section; it appears in the operand list *
+ * where the address constraint string is expected. *
413
+ ******************************************************************************/
414
+
415
+ // Size of long is architecture and OS specific, so long/unsigned long use the 64-bit store under __LP64__ and the 32-bit store otherwise.
416
+ #if defined(__LP64__) // 64 bits
417
+ __SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
418
+ __SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
419
+ #else // 32 bits
420
+ __SM_32_INTRINSICS_DECL__ void __stcg(long *ptr, long value) { asm ("st.global.cg.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
421
+ __SM_32_INTRINSICS_DECL__ void __stcg(unsigned long *ptr, unsigned long value) { asm ("st.global.cg.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
422
+ #endif
423
+
424
+
425
+ __SM_32_INTRINSICS_DECL__ void __stcg(char *ptr, char value) { asm ("st.global.cg.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
426
+ __SM_32_INTRINSICS_DECL__ void __stcg(signed char *ptr, signed char value) { asm ("st.global.cg.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
427
+ __SM_32_INTRINSICS_DECL__ void __stcg(short *ptr, short value) { asm ("st.global.cg.s16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); }
428
+ __SM_32_INTRINSICS_DECL__ void __stcg(int *ptr, int value) { asm ("st.global.cg.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
429
+ __SM_32_INTRINSICS_DECL__ void __stcg(long long *ptr, long long value) { asm ("st.global.cg.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
430
+ __SM_32_INTRINSICS_DECL__ void __stcg(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
431
+ __SM_32_INTRINSICS_DECL__ void __stcg(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
432
+ __SM_32_INTRINSICS_DECL__ void __stcg(short2 *ptr, short2 value) { asm ("st.global.cg.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
433
+ __SM_32_INTRINSICS_DECL__ void __stcg(short4 *ptr, short4 value) { asm ("st.global.cg.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
434
+ __SM_32_INTRINSICS_DECL__ void __stcg(int2 *ptr, int2 value) { asm ("st.global.cg.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
435
+ __SM_32_INTRINSICS_DECL__ void __stcg(int4 *ptr, int4 value) { asm ("st.global.cg.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
436
+ __SM_32_INTRINSICS_DECL__ void __stcg(longlong2 *ptr, longlong2 value) { asm ("st.global.cg.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
437
+
438
+ __SM_32_INTRINSICS_DECL__ void __stcg(unsigned char *ptr, unsigned char value) { asm ("st.global.cg.u8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
439
+ __SM_32_INTRINSICS_DECL__ void __stcg(unsigned short *ptr, unsigned short value) { asm ("st.global.cg.u16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); }
440
+ __SM_32_INTRINSICS_DECL__ void __stcg(unsigned int *ptr, unsigned int value) { asm ("st.global.cg.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
441
+ __SM_32_INTRINSICS_DECL__ void __stcg(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cg.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
442
+ __SM_32_INTRINSICS_DECL__ void __stcg(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.cg.v2.u8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
443
+ __SM_32_INTRINSICS_DECL__ void __stcg(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cg.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
444
+ __SM_32_INTRINSICS_DECL__ void __stcg(ushort2 *ptr, ushort2 value) { asm ("st.global.cg.v2.u16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
445
+ __SM_32_INTRINSICS_DECL__ void __stcg(ushort4 *ptr, ushort4 value) { asm ("st.global.cg.v4.u16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
446
+ __SM_32_INTRINSICS_DECL__ void __stcg(uint2 *ptr, uint2 value) { asm ("st.global.cg.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
447
+ __SM_32_INTRINSICS_DECL__ void __stcg(uint4 *ptr, uint4 value) { asm ("st.global.cg.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
448
+ __SM_32_INTRINSICS_DECL__ void __stcg(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cg.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
449
+
450
+ __SM_32_INTRINSICS_DECL__ void __stcg(float *ptr, float value) { asm ("st.global.cg.f32 [%0], %1;" :: __LDG_PTR (ptr), "f"(value) : "memory"); }
451
+ __SM_32_INTRINSICS_DECL__ void __stcg(double *ptr, double value) { asm ("st.global.cg.f64 [%0], %1;" :: __LDG_PTR (ptr), "d"(value) : "memory"); }
452
+ __SM_32_INTRINSICS_DECL__ void __stcg(float2 *ptr, float2 value) { asm ("st.global.cg.v2.f32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
453
+ __SM_32_INTRINSICS_DECL__ void __stcg(float4 *ptr, float4 value) { asm ("st.global.cg.v4.f32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
454
+ __SM_32_INTRINSICS_DECL__ void __stcg(double2 *ptr, double2 value) { asm ("st.global.cg.v2.f64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
455
+
456
+ /******************************************************************************
457
+ * __stcs *
+ * *
+ * One overload per scalar and built-in vector type. Each overload issues a *
+ * single inline-PTX `st.global.cs` store of `value` to `ptr`; `.cs` is the *
+ * cache-streaming operator in the PTX ISA. 8-bit values are widened to int *
+ * because they are bound with the 32-bit "r" constraint ("h" is used for *
+ * 16-bit, "l" for 64-bit, "f" for float and "d" for double operands). *
+ * __LDG_PTR is defined outside this section; it appears in the operand list *
+ * where the address constraint string is expected. *
458
+ ******************************************************************************/
459
+
460
+ // Size of long is architecture and OS specific, so long/unsigned long use the 64-bit store under __LP64__ and the 32-bit store otherwise.
461
+ #if defined(__LP64__) // 64 bits
462
+ __SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
463
+ __SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
464
+ #else // 32 bits
465
+ __SM_32_INTRINSICS_DECL__ void __stcs(long *ptr, long value) { asm ("st.global.cs.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
466
+ __SM_32_INTRINSICS_DECL__ void __stcs(unsigned long *ptr, unsigned long value) { asm ("st.global.cs.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
467
+ #endif
468
+
469
+
470
+ __SM_32_INTRINSICS_DECL__ void __stcs(char *ptr, char value) { asm ("st.global.cs.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
471
+ __SM_32_INTRINSICS_DECL__ void __stcs(signed char *ptr, signed char value) { asm ("st.global.cs.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
472
+ __SM_32_INTRINSICS_DECL__ void __stcs(short *ptr, short value) { asm ("st.global.cs.s16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); }
473
+ __SM_32_INTRINSICS_DECL__ void __stcs(int *ptr, int value) { asm ("st.global.cs.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
474
+ __SM_32_INTRINSICS_DECL__ void __stcs(long long *ptr, long long value) { asm ("st.global.cs.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
475
+ __SM_32_INTRINSICS_DECL__ void __stcs(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
476
+ __SM_32_INTRINSICS_DECL__ void __stcs(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
477
+ __SM_32_INTRINSICS_DECL__ void __stcs(short2 *ptr, short2 value) { asm ("st.global.cs.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
478
+ __SM_32_INTRINSICS_DECL__ void __stcs(short4 *ptr, short4 value) { asm ("st.global.cs.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
479
+ __SM_32_INTRINSICS_DECL__ void __stcs(int2 *ptr, int2 value) { asm ("st.global.cs.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
480
+ __SM_32_INTRINSICS_DECL__ void __stcs(int4 *ptr, int4 value) { asm ("st.global.cs.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
481
+ __SM_32_INTRINSICS_DECL__ void __stcs(longlong2 *ptr, longlong2 value) { asm ("st.global.cs.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
482
+
483
+ __SM_32_INTRINSICS_DECL__ void __stcs(unsigned char *ptr, unsigned char value) { asm ("st.global.cs.u8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
484
+ __SM_32_INTRINSICS_DECL__ void __stcs(unsigned short *ptr, unsigned short value) { asm ("st.global.cs.u16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); }
485
+ __SM_32_INTRINSICS_DECL__ void __stcs(unsigned int *ptr, unsigned int value) { asm ("st.global.cs.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
486
+ __SM_32_INTRINSICS_DECL__ void __stcs(unsigned long long *ptr, unsigned long long value) { asm ("st.global.cs.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
487
+ __SM_32_INTRINSICS_DECL__ void __stcs(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.cs.v2.u8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
488
+ __SM_32_INTRINSICS_DECL__ void __stcs(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.cs.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
489
+ __SM_32_INTRINSICS_DECL__ void __stcs(ushort2 *ptr, ushort2 value) { asm ("st.global.cs.v2.u16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
490
+ __SM_32_INTRINSICS_DECL__ void __stcs(ushort4 *ptr, ushort4 value) { asm ("st.global.cs.v4.u16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
491
+ __SM_32_INTRINSICS_DECL__ void __stcs(uint2 *ptr, uint2 value) { asm ("st.global.cs.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
492
+ __SM_32_INTRINSICS_DECL__ void __stcs(uint4 *ptr, uint4 value) { asm ("st.global.cs.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
493
+ __SM_32_INTRINSICS_DECL__ void __stcs(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.cs.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
494
+
495
+ __SM_32_INTRINSICS_DECL__ void __stcs(float *ptr, float value) { asm ("st.global.cs.f32 [%0], %1;" :: __LDG_PTR (ptr), "f"(value) : "memory"); }
496
+ __SM_32_INTRINSICS_DECL__ void __stcs(double *ptr, double value) { asm ("st.global.cs.f64 [%0], %1;" :: __LDG_PTR (ptr), "d"(value) : "memory"); }
497
+ __SM_32_INTRINSICS_DECL__ void __stcs(float2 *ptr, float2 value) { asm ("st.global.cs.v2.f32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
498
+ __SM_32_INTRINSICS_DECL__ void __stcs(float4 *ptr, float4 value) { asm ("st.global.cs.v4.f32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
499
+ __SM_32_INTRINSICS_DECL__ void __stcs(double2 *ptr, double2 value) { asm ("st.global.cs.v2.f64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
500
+
501
+ /******************************************************************************
502
+ * __stwt *
+ * *
+ * One overload per scalar and built-in vector type. Each overload issues a *
+ * single inline-PTX `st.global.wt` store of `value` to `ptr`; `.wt` is the *
+ * write-through cache operator in the PTX ISA. 8-bit values are widened to *
+ * int because they are bound with the 32-bit "r" constraint ("h" is used for *
+ * 16-bit, "l" for 64-bit, "f" for float and "d" for double operands). *
+ * __LDG_PTR is defined outside this section; it appears in the operand list *
+ * where the address constraint string is expected. *
503
+ ******************************************************************************/
504
+
505
+ // Size of long is architecture and OS specific, so long/unsigned long use the 64-bit store under __LP64__ and the 32-bit store otherwise.
506
+ #if defined(__LP64__) // 64 bits
507
+ __SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
508
+ __SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
509
+ #else // 32 bits
510
+ __SM_32_INTRINSICS_DECL__ void __stwt(long *ptr, long value) { asm ("st.global.wt.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
511
+ __SM_32_INTRINSICS_DECL__ void __stwt(unsigned long *ptr, unsigned long value) { asm ("st.global.wt.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
512
+ #endif
513
+
514
+
515
+ __SM_32_INTRINSICS_DECL__ void __stwt(char *ptr, char value) { asm ("st.global.wt.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
516
+ __SM_32_INTRINSICS_DECL__ void __stwt(signed char *ptr, signed char value) { asm ("st.global.wt.s8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
517
+ __SM_32_INTRINSICS_DECL__ void __stwt(short *ptr, short value) { asm ("st.global.wt.s16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); }
518
+ __SM_32_INTRINSICS_DECL__ void __stwt(int *ptr, int value) { asm ("st.global.wt.s32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
519
+ __SM_32_INTRINSICS_DECL__ void __stwt(long long *ptr, long long value) { asm ("st.global.wt.s64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
520
+ __SM_32_INTRINSICS_DECL__ void __stwt(char2 *ptr, char2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.s8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
521
+ __SM_32_INTRINSICS_DECL__ void __stwt(char4 *ptr, char4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.s8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
522
+ __SM_32_INTRINSICS_DECL__ void __stwt(short2 *ptr, short2 value) { asm ("st.global.wt.v2.s16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
523
+ __SM_32_INTRINSICS_DECL__ void __stwt(short4 *ptr, short4 value) { asm ("st.global.wt.v4.s16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
524
+ __SM_32_INTRINSICS_DECL__ void __stwt(int2 *ptr, int2 value) { asm ("st.global.wt.v2.s32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
525
+ __SM_32_INTRINSICS_DECL__ void __stwt(int4 *ptr, int4 value) { asm ("st.global.wt.v4.s32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
526
+ __SM_32_INTRINSICS_DECL__ void __stwt(longlong2 *ptr, longlong2 value) { asm ("st.global.wt.v2.s64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
527
+
528
+ __SM_32_INTRINSICS_DECL__ void __stwt(unsigned char *ptr, unsigned char value) { asm ("st.global.wt.u8 [%0], %1;" :: __LDG_PTR (ptr), "r"((int)value) : "memory"); }
529
+ __SM_32_INTRINSICS_DECL__ void __stwt(unsigned short *ptr, unsigned short value) { asm ("st.global.wt.u16 [%0], %1;" :: __LDG_PTR (ptr), "h"(value) : "memory"); }
530
+ __SM_32_INTRINSICS_DECL__ void __stwt(unsigned int *ptr, unsigned int value) { asm ("st.global.wt.u32 [%0], %1;" :: __LDG_PTR (ptr), "r"(value) : "memory"); }
531
+ __SM_32_INTRINSICS_DECL__ void __stwt(unsigned long long *ptr, unsigned long long value) { asm ("st.global.wt.u64 [%0], %1;" :: __LDG_PTR (ptr), "l"(value) : "memory"); }
532
+ __SM_32_INTRINSICS_DECL__ void __stwt(uchar2 *ptr, uchar2 value) { const int x = value.x, y = value.y; asm ("st.global.wt.v2.u8 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(x), "r"(y) : "memory"); }
533
+ __SM_32_INTRINSICS_DECL__ void __stwt(uchar4 *ptr, uchar4 value) { const int x = value.x, y = value.y, z = value.z, w = value.w; asm ("st.global.wt.v4.u8 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(x), "r"(y), "r"(z), "r"(w) : "memory"); }
534
+ __SM_32_INTRINSICS_DECL__ void __stwt(ushort2 *ptr, ushort2 value) { asm ("st.global.wt.v2.u16 [%0], {%1,%2};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y) : "memory"); }
535
+ __SM_32_INTRINSICS_DECL__ void __stwt(ushort4 *ptr, ushort4 value) { asm ("st.global.wt.v4.u16 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "h"(value.x), "h"(value.y), "h"(value.z), "h"(value.w) : "memory"); }
536
+ __SM_32_INTRINSICS_DECL__ void __stwt(uint2 *ptr, uint2 value) { asm ("st.global.wt.v2.u32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y) : "memory"); }
537
+ __SM_32_INTRINSICS_DECL__ void __stwt(uint4 *ptr, uint4 value) { asm ("st.global.wt.v4.u32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "r"(value.x), "r"(value.y), "r"(value.z), "r"(value.w) : "memory"); }
538
+ __SM_32_INTRINSICS_DECL__ void __stwt(ulonglong2 *ptr, ulonglong2 value) { asm ("st.global.wt.v2.u64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "l"(value.x), "l"(value.y) : "memory"); }
539
+
540
+ __SM_32_INTRINSICS_DECL__ void __stwt(float *ptr, float value) { asm ("st.global.wt.f32 [%0], %1;" :: __LDG_PTR (ptr), "f"(value) : "memory"); }
541
+ __SM_32_INTRINSICS_DECL__ void __stwt(double *ptr, double value) { asm ("st.global.wt.f64 [%0], %1;" :: __LDG_PTR (ptr), "d"(value) : "memory"); }
542
+ __SM_32_INTRINSICS_DECL__ void __stwt(float2 *ptr, float2 value) { asm ("st.global.wt.v2.f32 [%0], {%1,%2};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y) : "memory"); }
543
+ __SM_32_INTRINSICS_DECL__ void __stwt(float4 *ptr, float4 value) { asm ("st.global.wt.v4.f32 [%0], {%1,%2,%3,%4};" :: __LDG_PTR (ptr), "f"(value.x), "f"(value.y), "f"(value.z), "f"(value.w) : "memory"); }
544
+ __SM_32_INTRINSICS_DECL__ void __stwt(double2 *ptr, double2 value) { asm ("st.global.wt.v2.f64 [%0], {%1,%2};" :: __LDG_PTR (ptr), "d"(value.x), "d"(value.y) : "memory"); }
545
+
546
+ #undef __LDG_PTR
547
+
548
+
549
+ // SHF is the "funnel shift" operation - an accelerated left/right shift with carry
550
+ // operating on 64-bit quantities, which are concatenations of two 32-bit registers.
551
+
552
+ // This shifts the 64-bit value [hi:lo] left by "shift" bits, returning the most significant 32 bits of the result. Wrap variant: emits shf.l.wrap.b32, which per the PTX ISA takes the shift amount modulo 32.
553
+ __SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_l(unsigned int lo, unsigned int hi, unsigned int shift)
554
+ {
555
+ unsigned int ret;
556
+ asm volatile ("shf.l.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // operands: %0 = ret, %1 = lo, %2 = hi, %3 = shift
557
+ return ret;
558
+ }
559
+ __SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_lc(unsigned int lo, unsigned int hi, unsigned int shift) // clamp variant of the left funnel shift of [hi:lo]: emits shf.l.clamp.b32, which per the PTX ISA clamps the shift amount to 32 instead of wrapping it
560
+ {
561
+ unsigned int ret;
562
+ asm volatile ("shf.l.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // operands: %0 = ret, %1 = lo, %2 = hi, %3 = shift
563
+ return ret;
564
+ }
565
+
566
+ // This shifts the 64-bit value [hi:lo] right by "shift" bits, returning the least significant 32 bits of the result. Wrap variant: emits shf.r.wrap.b32, which per the PTX ISA takes the shift amount modulo 32.
567
+ __SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_r(unsigned int lo, unsigned int hi, unsigned int shift)
568
+ {
569
+ unsigned int ret;
570
+ asm volatile ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // operands: %0 = ret, %1 = lo, %2 = hi, %3 = shift
571
+ return ret;
572
+ }
573
+ __SM_32_INTRINSICS_DECL__ unsigned int __funnelshift_rc(unsigned int lo, unsigned int hi, unsigned int shift) // clamp variant of the right funnel shift of [hi:lo]: emits shf.r.clamp.b32, which per the PTX ISA clamps the shift amount to 32 instead of wrapping it
574
+ {
575
+ unsigned int ret;
576
+ asm volatile ("shf.r.clamp.b32 %0, %1, %2, %3;" : "=r"(ret) : "r"(lo), "r"(hi), "r"(shift)); // operands: %0 = ret, %1 = lo, %2 = hi, %3 = shift
577
+ return ret;
578
+ }
579
+
580
+
581
+ #endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
582
+
583
+ #endif /* __cplusplus && __CUDACC__ */
584
+
585
+ #undef __SM_32_INTRINSICS_DECL__
586
+
587
+ #endif /* !__SM_32_INTRINSICS_HPP__ */
588
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_60_atomic_functions.h ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ //NOTE: For NVRTC, these declarations have been moved into the compiler (to reduce compile time)
51
+ #define EXCLUDE_FROM_RTC
52
+
53
+ #if !defined(__SM_60_ATOMIC_FUNCTIONS_H__)
54
+ #define __SM_60_ATOMIC_FUNCTIONS_H__
55
+
56
+
57
+ #if defined(__CUDACC_RTC__) /* runtime compilation (NVRTC): plain __device__ declarations */
58
+ #define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
59
+ #elif defined(_NVHPC_CUDA) /* _NVHPC_CUDA builds: extern __device__ builtins */
60
+ #define __SM_60_ATOMIC_FUNCTIONS_DECL__ extern __device__ __cudart_builtin__
61
+ #else /* !__CUDACC_RTC__ && !_NVHPC_CUDA: static inline __device__ functions */
62
+ #define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
63
+ #endif /* __CUDACC_RTC__ / _NVHPC_CUDA */
64
+
65
+ #if defined(__cplusplus) && defined(__CUDACC__)
66
+
67
+ #if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
68
+
69
+ /*******************************************************************************
70
+ * *
71
+ * *
72
+ * *
73
+ *******************************************************************************/
74
+
75
+ #include "cuda_runtime_api.h"
76
+
77
+ /* Add !defined(_NVHPC_CUDA) to avoid empty function definition in CUDA
78
+ * C++ compiler where the macro __CUDA_ARCH__ is not defined. */
79
+ #if !defined(__CUDA_ARCH__) && !defined(_NVHPC_CUDA)
80
+ #define __DEF_IF_HOST { }
81
+ #else /* !__CUDA_ARCH__ */
82
+ #define __DEF_IF_HOST ;
83
+ #endif /* __CUDA_ARCH__ */
84
+
85
+
86
+
87
+ /*******************************************************************************
88
+ * *
89
+ * *
90
+ * *
91
+ *******************************************************************************/
92
+
93
+ __SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val) __DEF_IF_HOST
94
+
95
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
96
+ int atomicAdd_block(int *address, int val) __DEF_IF_HOST
97
+
98
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
99
+ int atomicAdd_system(int *address, int val) __DEF_IF_HOST
100
+
101
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
102
+ unsigned int atomicAdd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
103
+
104
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
105
+ unsigned int atomicAdd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
106
+
107
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
108
+ unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
109
+
110
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
111
+ unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
112
+
113
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
114
+ float atomicAdd_block(float *address, float val) __DEF_IF_HOST
115
+
116
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
117
+ float atomicAdd_system(float *address, float val) __DEF_IF_HOST
118
+
119
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
120
+ double atomicAdd_block(double *address, double val) __DEF_IF_HOST
121
+
122
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
123
+ double atomicAdd_system(double *address, double val) __DEF_IF_HOST
124
+
125
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
126
+ int atomicSub_block(int *address, int val) __DEF_IF_HOST
127
+
128
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
129
+ int atomicSub_system(int *address, int val) __DEF_IF_HOST
130
+
131
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
132
+ unsigned int atomicSub_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
133
+
134
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
135
+ unsigned int atomicSub_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
136
+
137
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
138
+ int atomicExch_block(int *address, int val) __DEF_IF_HOST
139
+
140
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
141
+ int atomicExch_system(int *address, int val) __DEF_IF_HOST
142
+
143
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
144
+ unsigned int atomicExch_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
145
+
146
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
147
+ unsigned int atomicExch_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
148
+
149
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
150
+ unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
151
+
152
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
153
+ unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
154
+
155
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
156
+ float atomicExch_block(float *address, float val) __DEF_IF_HOST
157
+
158
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
159
+ float atomicExch_system(float *address, float val) __DEF_IF_HOST
160
+
161
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
162
+ int atomicMin_block(int *address, int val) __DEF_IF_HOST
163
+
164
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
165
+ int atomicMin_system(int *address, int val) __DEF_IF_HOST
166
+
167
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
168
+ long long atomicMin_block(long long *address, long long val) __DEF_IF_HOST
169
+
170
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
171
+ long long atomicMin_system(long long *address, long long val) __DEF_IF_HOST
172
+
173
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
174
+ unsigned int atomicMin_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
175
+
176
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
177
+ unsigned int atomicMin_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
178
+
179
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
180
+ unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
181
+
182
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
183
+ unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
184
+
185
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
186
+ int atomicMax_block(int *address, int val) __DEF_IF_HOST
187
+
188
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
189
+ int atomicMax_system(int *address, int val) __DEF_IF_HOST
190
+
191
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
192
+ long long atomicMax_block(long long *address, long long val) __DEF_IF_HOST
193
+
194
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
195
+ long long atomicMax_system(long long *address, long long val) __DEF_IF_HOST
196
+
197
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
198
+ unsigned int atomicMax_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
199
+
200
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
201
+ unsigned int atomicMax_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
202
+
203
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
204
+ unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
205
+
206
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
207
+ unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
208
+
209
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
210
+ unsigned int atomicInc_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
211
+
212
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
213
+ unsigned int atomicInc_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
214
+
215
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
216
+ unsigned int atomicDec_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
217
+
218
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
219
+ unsigned int atomicDec_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
220
+
221
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
222
+ int atomicCAS_block(int *address, int compare, int val) __DEF_IF_HOST
223
+
224
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
225
+ int atomicCAS_system(int *address, int compare, int val) __DEF_IF_HOST
226
+
227
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
228
+ unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
229
+ unsigned int val) __DEF_IF_HOST
230
+
231
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
232
+ unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
233
+ unsigned int val) __DEF_IF_HOST
234
+
235
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
236
+ unsigned long long int atomicCAS_block(unsigned long long int *address,
237
+ unsigned long long int compare,
238
+ unsigned long long int val) __DEF_IF_HOST
239
+
240
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
241
+ unsigned long long int atomicCAS_system(unsigned long long int *address,
242
+ unsigned long long int compare,
243
+ unsigned long long int val) __DEF_IF_HOST
244
+
245
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
246
+ int atomicAnd_block(int *address, int val) __DEF_IF_HOST
247
+
248
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
249
+ int atomicAnd_system(int *address, int val) __DEF_IF_HOST
250
+
251
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
252
+ long long atomicAnd_block(long long *address, long long val) __DEF_IF_HOST
253
+
254
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
255
+ long long atomicAnd_system(long long *address, long long val) __DEF_IF_HOST
256
+
257
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
258
+ unsigned int atomicAnd_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
259
+
260
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
261
+ unsigned int atomicAnd_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
262
+
263
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
264
+ unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
265
+
266
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
267
+ unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
268
+
269
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
270
+ int atomicOr_block(int *address, int val) __DEF_IF_HOST
271
+
272
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
273
+ int atomicOr_system(int *address, int val) __DEF_IF_HOST
274
+
275
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
276
+ long long atomicOr_block(long long *address, long long val) __DEF_IF_HOST
277
+
278
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
279
+ long long atomicOr_system(long long *address, long long val) __DEF_IF_HOST
280
+
281
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
282
+ unsigned int atomicOr_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
283
+
284
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
285
+ unsigned int atomicOr_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
286
+
287
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
288
+ unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
289
+
290
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
291
+ unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
292
+
293
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
294
+ int atomicXor_block(int *address, int val) __DEF_IF_HOST
295
+
296
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
297
+ int atomicXor_system(int *address, int val) __DEF_IF_HOST
298
+
299
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
300
+ long long atomicXor_block(long long *address, long long val) __DEF_IF_HOST
301
+
302
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
303
+ long long atomicXor_system(long long *address, long long val) __DEF_IF_HOST
304
+
305
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
306
+ unsigned int atomicXor_block(unsigned int *address, unsigned int val) __DEF_IF_HOST
307
+
308
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
309
+ unsigned int atomicXor_system(unsigned int *address, unsigned int val) __DEF_IF_HOST
310
+
311
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
312
+ unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
313
+
314
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
315
+ unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val) __DEF_IF_HOST
316
+
317
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
318
+
319
+ #endif /* __cplusplus && __CUDACC__ */
320
+
321
+ #undef __SM_60_ATOMIC_FUNCTIONS_DECL__
322
+ #undef __DEF_IF_HOST
323
+
324
+ #if !defined(__CUDACC_RTC__) && defined(__CUDA_ARCH__)
325
+ #include "sm_60_atomic_functions.hpp"
326
+ #endif /* !__CUDACC_RTC__ && defined(__CUDA_ARCH__) */
327
+
328
+ #endif /* !__SM_60_ATOMIC_FUNCTIONS_H__ */
329
+
330
+ #undef EXCLUDE_FROM_RTC
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/triton/backends/nvidia/include/sm_61_intrinsics.hpp ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_61_INTRINSICS_HPP__)
51
+ #define __SM_61_INTRINSICS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_61_INTRINSICS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__ */
56
+ #define __SM_61_INTRINSICS_DECL__ static __device__ __inline__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if defined(_NVHPC_CUDA) || !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 610
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ /*******************************************************************************
72
+ * *
73
+ * Below are implementations of SM-6.1 intrinsics which are included as *
74
+ * source (instead of being built in to the compiler) *
75
+ * *
76
+ *******************************************************************************/
77
+
78
+ // 4a
79
+ __SM_61_INTRINSICS_DECL__ int __dp4a(int srcA, int srcB, int c) {
80
+ int ret;
81
+ asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
82
+ return ret;
83
+ }
84
+
85
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp4a(unsigned int srcA, unsigned int srcB, unsigned int c) {
86
+ unsigned int ret;
87
+ asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
88
+ return ret;
89
+ }
90
+
91
+ __SM_61_INTRINSICS_DECL__ int __dp4a(char4 srcA, char4 srcB, int c) {
92
+ int ret;
93
+ asm volatile ("dp4a.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
94
+ return ret;
95
+ }
96
+
97
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp4a(uchar4 srcA, uchar4 srcB, unsigned int c) {
98
+ unsigned int ret;
99
+ asm volatile ("dp4a.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
100
+ return ret;
101
+ }
102
+
103
+ // 2a.lo
104
+ __SM_61_INTRINSICS_DECL__ int __dp2a_lo(int srcA, int srcB, int c) {
105
+ int ret;
106
+ asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
107
+ return ret;
108
+ }
109
+
110
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(unsigned int srcA, unsigned int srcB, unsigned int c) {
111
+ unsigned int ret;
112
+ asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
113
+ return ret;
114
+ }
115
+
116
+ __SM_61_INTRINSICS_DECL__ int __dp2a_lo(short2 srcA, char4 srcB, int c) {
117
+ int ret;
118
+ asm volatile ("dp2a.lo.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
119
+ return ret;
120
+ }
121
+
122
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_lo(ushort2 srcA, uchar4 srcB, unsigned int c) {
123
+ unsigned int ret;
124
+ asm volatile ("dp2a.lo.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
125
+ return ret;
126
+ }
127
+
128
+ // 2a.hi
129
+ __SM_61_INTRINSICS_DECL__ int __dp2a_hi(int srcA, int srcB, int c) {
130
+ int ret;
131
+ asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
132
+ return ret;
133
+ }
134
+
135
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(unsigned int srcA, unsigned int srcB, unsigned int c) {
136
+ unsigned int ret;
137
+ asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(srcA), "r"(srcB), "r"(c));
138
+ return ret;
139
+ }
140
+
141
+ __SM_61_INTRINSICS_DECL__ int __dp2a_hi(short2 srcA, char4 srcB, int c) {
142
+ int ret;
143
+ asm volatile ("dp2a.hi.s32.s32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(int *)&srcA), "r"(*(int *)&srcB), "r"(c));
144
+ return ret;
145
+ }
146
+
147
+ __SM_61_INTRINSICS_DECL__ unsigned int __dp2a_hi(ushort2 srcA, uchar4 srcB, unsigned int c) {
148
+ unsigned int ret;
149
+ asm volatile ("dp2a.hi.u32.u32 %0, %1, %2, %3;" : "=r"(ret) : "r"(*(unsigned int *)&srcA), "r"(*(unsigned int *)&srcB), "r"(c));
150
+ return ret;
151
+ }
152
+
153
+
154
+ #endif /* _NVHPC_CUDA || !__CUDA_ARCH__ || __CUDA_ARCH__ >= 610 */
155
+
156
+ #endif /* __cplusplus && __CUDACC__ */
157
+
158
+ #undef __SM_61_INTRINSICS_DECL__
159
+
160
+ #endif /* !__SM_61_INTRINSICS_HPP__ */
161
+
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (6.96 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/convert.cpython-312.pyc ADDED
Binary file (16.2 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/pack.cpython-312.pyc ADDED
Binary file (4.49 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/tags.cpython-312.pyc ADDED
Binary file (6.78 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/cli/__pycache__/unpack.cpython-312.pyc ADDED
Binary file (1.56 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (220 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ This software is made available under the terms of *either* of the licenses
2
+ found in LICENSE.APACHE or LICENSE.BSD. Contributions to this software is made
3
+ under the terms of *both* these licenses.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE.APACHE ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ Apache License
3
+ Version 2.0, January 2004
4
+ http://www.apache.org/licenses/
5
+
6
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7
+
8
+ 1. Definitions.
9
+
10
+ "License" shall mean the terms and conditions for use, reproduction,
11
+ and distribution as defined by Sections 1 through 9 of this document.
12
+
13
+ "Licensor" shall mean the copyright owner or entity authorized by
14
+ the copyright owner that is granting the License.
15
+
16
+ "Legal Entity" shall mean the union of the acting entity and all
17
+ other entities that control, are controlled by, or are under common
18
+ control with that entity. For the purposes of this definition,
19
+ "control" means (i) the power, direct or indirect, to cause the
20
+ direction or management of such entity, whether by contract or
21
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
22
+ outstanding shares, or (iii) beneficial ownership of such entity.
23
+
24
+ "You" (or "Your") shall mean an individual or Legal Entity
25
+ exercising permissions granted by this License.
26
+
27
+ "Source" form shall mean the preferred form for making modifications,
28
+ including but not limited to software source code, documentation
29
+ source, and configuration files.
30
+
31
+ "Object" form shall mean any form resulting from mechanical
32
+ transformation or translation of a Source form, including but
33
+ not limited to compiled object code, generated documentation,
34
+ and conversions to other media types.
35
+
36
+ "Work" shall mean the work of authorship, whether in Source or
37
+ Object form, made available under the License, as indicated by a
38
+ copyright notice that is included in or attached to the work
39
+ (an example is provided in the Appendix below).
40
+
41
+ "Derivative Works" shall mean any work, whether in Source or Object
42
+ form, that is based on (or derived from) the Work and for which the
43
+ editorial revisions, annotations, elaborations, or other modifications
44
+ represent, as a whole, an original work of authorship. For the purposes
45
+ of this License, Derivative Works shall not include works that remain
46
+ separable from, or merely link (or bind by name) to the interfaces of,
47
+ the Work and Derivative Works thereof.
48
+
49
+ "Contribution" shall mean any work of authorship, including
50
+ the original version of the Work and any modifications or additions
51
+ to that Work or Derivative Works thereof, that is intentionally
52
+ submitted to Licensor for inclusion in the Work by the copyright owner
53
+ or by an individual or Legal Entity authorized to submit on behalf of
54
+ the copyright owner. For the purposes of this definition, "submitted"
55
+ means any form of electronic, verbal, or written communication sent
56
+ to the Licensor or its representatives, including but not limited to
57
+ communication on electronic mailing lists, source code control systems,
58
+ and issue tracking systems that are managed by, or on behalf of, the
59
+ Licensor for the purpose of discussing and improving the Work, but
60
+ excluding communication that is conspicuously marked or otherwise
61
+ designated in writing by the copyright owner as "Not a Contribution."
62
+
63
+ "Contributor" shall mean Licensor and any individual or Legal Entity
64
+ on behalf of whom a Contribution has been received by Licensor and
65
+ subsequently incorporated within the Work.
66
+
67
+ 2. Grant of Copyright License. Subject to the terms and conditions of
68
+ this License, each Contributor hereby grants to You a perpetual,
69
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70
+ copyright license to reproduce, prepare Derivative Works of,
71
+ publicly display, publicly perform, sublicense, and distribute the
72
+ Work and such Derivative Works in Source or Object form.
73
+
74
+ 3. Grant of Patent License. Subject to the terms and conditions of
75
+ this License, each Contributor hereby grants to You a perpetual,
76
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77
+ (except as stated in this section) patent license to make, have made,
78
+ use, offer to sell, sell, import, and otherwise transfer the Work,
79
+ where such license applies only to those patent claims licensable
80
+ by such Contributor that are necessarily infringed by their
81
+ Contribution(s) alone or by combination of their Contribution(s)
82
+ with the Work to which such Contribution(s) was submitted. If You
83
+ institute patent litigation against any entity (including a
84
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
85
+ or a Contribution incorporated within the Work constitutes direct
86
+ or contributory patent infringement, then any patent licenses
87
+ granted to You under this License for that Work shall terminate
88
+ as of the date such litigation is filed.
89
+
90
+ 4. Redistribution. You may reproduce and distribute copies of the
91
+ Work or Derivative Works thereof in any medium, with or without
92
+ modifications, and in Source or Object form, provided that You
93
+ meet the following conditions:
94
+
95
+ (a) You must give any other recipients of the Work or
96
+ Derivative Works a copy of this License; and
97
+
98
+ (b) You must cause any modified files to carry prominent notices
99
+ stating that You changed the files; and
100
+
101
+ (c) You must retain, in the Source form of any Derivative Works
102
+ that You distribute, all copyright, patent, trademark, and
103
+ attribution notices from the Source form of the Work,
104
+ excluding those notices that do not pertain to any part of
105
+ the Derivative Works; and
106
+
107
+ (d) If the Work includes a "NOTICE" text file as part of its
108
+ distribution, then any Derivative Works that You distribute must
109
+ include a readable copy of the attribution notices contained
110
+ within such NOTICE file, excluding those notices that do not
111
+ pertain to any part of the Derivative Works, in at least one
112
+ of the following places: within a NOTICE text file distributed
113
+ as part of the Derivative Works; within the Source form or
114
+ documentation, if provided along with the Derivative Works; or,
115
+ within a display generated by the Derivative Works, if and
116
+ wherever such third-party notices normally appear. The contents
117
+ of the NOTICE file are for informational purposes only and
118
+ do not modify the License. You may add Your own attribution
119
+ notices within Derivative Works that You distribute, alongside
120
+ or as an addendum to the NOTICE text from the Work, provided
121
+ that such additional attribution notices cannot be construed
122
+ as modifying the License.
123
+
124
+ You may add Your own copyright statement to Your modifications and
125
+ may provide additional or different license terms and conditions
126
+ for use, reproduction, or distribution of Your modifications, or
127
+ for any such Derivative Works as a whole, provided Your use,
128
+ reproduction, and distribution of the Work otherwise complies with
129
+ the conditions stated in this License.
130
+
131
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
132
+ any Contribution intentionally submitted for inclusion in the Work
133
+ by You to the Licensor shall be under the terms and conditions of
134
+ this License, without any additional terms or conditions.
135
+ Notwithstanding the above, nothing herein shall supersede or modify
136
+ the terms of any separate license agreement you may have executed
137
+ with Licensor regarding such Contributions.
138
+
139
+ 6. Trademarks. This License does not grant permission to use the trade
140
+ names, trademarks, service marks, or product names of the Licensor,
141
+ except as required for reasonable and customary use in describing the
142
+ origin of the Work and reproducing the content of the NOTICE file.
143
+
144
+ 7. Disclaimer of Warranty. Unless required by applicable law or
145
+ agreed to in writing, Licensor provides the Work (and each
146
+ Contributor provides its Contributions) on an "AS IS" BASIS,
147
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148
+ implied, including, without limitation, any warranties or conditions
149
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150
+ PARTICULAR PURPOSE. You are solely responsible for determining the
151
+ appropriateness of using or redistributing the Work and assume any
152
+ risks associated with Your exercise of permissions under this License.
153
+
154
+ 8. Limitation of Liability. In no event and under no legal theory,
155
+ whether in tort (including negligence), contract, or otherwise,
156
+ unless required by applicable law (such as deliberate and grossly
157
+ negligent acts) or agreed to in writing, shall any Contributor be
158
+ liable to You for damages, including any direct, indirect, special,
159
+ incidental, or consequential damages of any character arising as a
160
+ result of this License or out of the use or inability to use the
161
+ Work (including but not limited to damages for loss of goodwill,
162
+ work stoppage, computer failure or malfunction, or any and all
163
+ other commercial damages or losses), even if such Contributor
164
+ has been advised of the possibility of such damages.
165
+
166
+ 9. Accepting Warranty or Additional Liability. While redistributing
167
+ the Work or Derivative Works thereof, You may choose to offer,
168
+ and charge a fee for, acceptance of support, warranty, indemnity,
169
+ or other liability obligations and/or rights consistent with this
170
+ License. However, in accepting such obligations, You may act only
171
+ on Your own behalf and on Your sole responsibility, not on behalf
172
+ of any other Contributor, and only if You agree to indemnify,
173
+ defend, and hold each Contributor harmless for any liability
174
+ incurred by, or claims asserted against, such Contributor by reason
175
+ of your accepting any such warranty or additional liability.
176
+
177
+ END OF TERMS AND CONDITIONS
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/LICENSE.BSD ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Copyright (c) Donald Stufft and individual contributors.
2
+ All rights reserved.
3
+
4
+ Redistribution and use in source and binary forms, with or without
5
+ modification, are permitted provided that the following conditions are met:
6
+
7
+ 1. Redistributions of source code must retain the above copyright notice,
8
+ this list of conditions and the following disclaimer.
9
+
10
+ 2. Redistributions in binary form must reproduce the above copyright
11
+ notice, this list of conditions and the following disclaimer in the
12
+ documentation and/or other materials provided with the distribution.
13
+
14
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
15
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
16
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
17
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
18
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
20
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
21
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
22
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__init__.py ADDED
File without changes
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (230 Bytes). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_elffile.cpython-312.pyc ADDED
Binary file (5.06 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_manylinux.cpython-312.pyc ADDED
Binary file (9.93 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_musllinux.cpython-312.pyc ADDED
Binary file (4.61 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_parser.cpython-312.pyc ADDED
Binary file (14.1 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_structures.cpython-312.pyc ADDED
Binary file (3.28 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/_tokenizer.cpython-312.pyc ADDED
Binary file (7.97 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/markers.cpython-312.pyc ADDED
Binary file (10.6 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/requirements.cpython-312.pyc ADDED
Binary file (4.49 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/specifiers.cpython-312.pyc ADDED
Binary file (39.6 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/tags.cpython-312.pyc ADDED
Binary file (21.8 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/utils.cpython-312.pyc ADDED
Binary file (7.32 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/__pycache__/version.cpython-312.pyc ADDED
Binary file (20 kB). View file
 
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_elffile.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ELF file parser.
3
+
4
+ This provides a class ``ELFFile`` that parses an ELF executable in a similar
5
+ interface to ``ZipFile``. Only the read interface is implemented.
6
+
7
+ Based on: https://gist.github.com/lyssdod/f51579ae8d93c8657a5564aefc2ffbca
8
+ ELF header: https://refspecs.linuxfoundation.org/elf/gabi4+/ch4.eheader.html
9
+ """
10
+
11
+ import enum
12
+ import os
13
+ import struct
14
+ from typing import IO, Optional, Tuple
15
+
16
+
17
class ELFInvalid(ValueError):
    """Raised when a file cannot be interpreted as an ELF binary."""
19
+
20
+
21
class EIClass(enum.IntEnum):
    """File class (bitness) values, named after the ELF ``EI_CLASS`` byte."""

    C32 = 1  # 32-bit objects
    C64 = 2  # 64-bit objects
24
+
25
+
26
class EIData(enum.IntEnum):
    """Data encoding (byte order) values, named after the ELF ``EI_DATA`` byte."""

    Lsb = 1  # least-significant byte first
    Msb = 2  # most-significant byte first
29
+
30
+
31
class EMachine(enum.IntEnum):
    """Architecture identifiers, named after the ELF header's ``e_machine`` field."""

    I386 = 3
    S390 = 22
    Arm = 40
    X8664 = 62
    AArc64 = 183
37
+
38
+
39
class ELFFile:
    """
    Representation of an ELF executable.

    The constructor reads the identification bytes and the fixed part of the
    ELF header from ``f``; ``interpreter`` later seeks within the same file
    object, so it must stay open for as long as the instance is used.

    :raises ELFInvalid: if the identification bytes or header cannot be
        unpacked, the magic number is wrong, or the class/encoding pair is
        not one of the four supported combinations.
    """

    def __init__(self, f: IO[bytes]) -> None:
        self._f = f

        try:
            ident = self._read("16B")
        except struct.error as e:
            # Chain the cause so a truncated file is distinguishable in
            # tracebacks, matching the header-parsing error below.
            raise ELFInvalid("unable to parse identification") from e
        magic = bytes(ident[:4])
        if magic != b"\x7fELF":
            raise ELFInvalid(f"invalid magic: {magic!r}")

        self.capacity = ident[4]  # File class (bitness).
        self.encoding = ident[5]  # Data structure encoding (endianness).

        try:
            # e_fmt: Format for the ELF header fields after the identification.
            # p_fmt: Format for one program header entry.
            # p_idx: Indexes to find p_type, p_offset, and p_filesz.
            e_fmt, self._p_fmt, self._p_idx = {
                (1, 1): ("<HHIIIIIHHH", "<IIIIIIII", (0, 1, 4)),  # 32-bit LSB.
                (1, 2): (">HHIIIIIHHH", ">IIIIIIII", (0, 1, 4)),  # 32-bit MSB.
                (2, 1): ("<HHIQQQIHHH", "<IIQQQQQQ", (0, 2, 5)),  # 64-bit LSB.
                (2, 2): (">HHIQQQIHHH", ">IIQQQQQQ", (0, 2, 5)),  # 64-bit MSB.
            }[(self.capacity, self.encoding)]
        except KeyError as e:
            raise ELFInvalid(
                f"unrecognized capacity ({self.capacity}) or "
                f"encoding ({self.encoding})"
            ) from e

        try:
            (
                _,
                self.machine,  # Architecture type.
                _,
                _,
                self._e_phoff,  # Offset of the program header table.
                _,
                self.flags,  # Processor-specific flags.
                _,
                self._e_phentsize,  # Size of one program header entry.
                self._e_phnum,  # Number of program header entries.
            ) = self._read(e_fmt)
        except struct.error as e:
            raise ELFInvalid("unable to parse machine and section information") from e

    def _read(self, fmt: str) -> Tuple[int, ...]:
        """Read and unpack ``fmt`` at the current file position.

        ``struct.error`` propagates when fewer bytes than ``fmt`` requires
        are available.
        """
        return struct.unpack(fmt, self._f.read(struct.calcsize(fmt)))

    @property
    def interpreter(self) -> Optional[str]:
        """
        The path recorded in the ``PT_INTERP`` program header, or ``None``
        when no such entry is found.
        """
        for index in range(self._e_phnum):
            self._f.seek(self._e_phoff + self._e_phentsize * index)
            try:
                data = self._read(self._p_fmt)
            except struct.error:
                # Entry is truncated; skip it and keep scanning.
                continue
            if data[self._p_idx[0]] != 3:  # Not PT_INTERP.
                continue
            # p_offset / p_filesz locate the NUL-terminated path in the file.
            self._f.seek(data[self._p_idx[1]])
            return os.fsdecode(self._f.read(data[self._p_idx[2]])).strip("\0")
        return None
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_musllinux.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """PEP 656 support.
2
+
3
+ This module implements logic to detect if the currently running Python is
4
+ linked against musl, and what musl version is used.
5
+ """
6
+
7
+ import functools
8
+ import re
9
+ import subprocess
10
+ import sys
11
+ from typing import Iterator, NamedTuple, Optional, Sequence
12
+
13
+ from ._elffile import ELFFile
14
+
15
+
16
+ class _MuslVersion(NamedTuple):
17
+ major: int
18
+ minor: int
19
+
20
+
21
def _parse_musl_version(output: str) -> Optional[_MuslVersion]:
    """Extract the musl version from the loader's banner text.

    Blank lines are ignored. The first remaining line must start with
    ``musl`` and the second must start with ``Version <major>.<minor>``;
    otherwise ``None`` is returned.
    """
    nonblank = [text for text in map(str.strip, output.splitlines()) if text]
    if len(nonblank) < 2:
        return None
    if not nonblank[0].startswith("musl"):
        return None
    found = re.match(r"Version (\d+)\.(\d+)", nonblank[1])
    if found is None:
        return None
    major, minor = found.group(1), found.group(2)
    return _MuslVersion(major=int(major), minor=int(minor))
29
+
30
+
31
@functools.lru_cache
def _get_musl_version(executable: str) -> Optional[_MuslVersion]:
    """Detect currently-running musl runtime version.

    The executable's ELF interpreter path is read; when that path mentions
    ``musl`` the loader is executed and its stderr is parsed for a version
    string. If the loader is musl, the output would be something like::

        musl libc (x86_64)
        Version 1.2.2
        Dynamic Program Loader

    ``None`` is returned when the executable cannot be opened or parsed, or
    when its interpreter is absent or not a musl loader.
    """
    try:
        with open(executable, "rb") as binary:
            loader = ELFFile(binary).interpreter
    except (OSError, TypeError, ValueError):
        return None
    if loader is not None and "musl" in loader:
        # Run without arguments, the loader writes its banner to stderr.
        completed = subprocess.run([loader], stderr=subprocess.PIPE, text=True)
        return _parse_musl_version(completed.stderr)
    return None
52
+
53
+
54
def platform_tags(archs: Sequence[str]) -> Iterator[str]:
    """Generate musllinux tags compatible to the current platform.

    :param archs: Sequence of compatible architectures.
        The first one shall be the closest to the actual architecture and be the part of
        platform tag after the ``linux_`` prefix, e.g. ``x86_64``.
        The ``linux_`` prefix is assumed as a prerequisite for the current platform to
        be musllinux-compatible.

    :returns: An iterator of compatible musllinux tags.
    """
    detected = _get_musl_version(sys.executable)
    if detected is None:  # Python not dynamically linked against musl.
        return
    major = detected.major
    # Minor versions from the detected one down to 0, for each arch in turn.
    descending_minors = range(detected.minor, -1, -1)
    for arch in archs:
        for minor in descending_minors:
            yield f"musllinux_{major}_{minor}_{arch}"
71
+
72
+
73
if __name__ == "__main__":  # pragma: no cover
    # Debugging aid: print the detected platform, musl version and tags.
    import sysconfig

    plat = sysconfig.get_platform()
    assert plat.startswith("linux-"), "not linux"

    print("plat:", plat)
    print("musl:", _get_musl_version(sys.executable))
    print("tags:", end=" ")
    # platform_tags() takes a sequence of architectures. Passing the bare
    # string would iterate it character by character and print bogus tags,
    # so wrap the single architecture in a list.
    arch = re.sub(r"[.-]", "_", plat.split("-", 1)[-1])
    for t in platform_tags([arch]):
        print(t, end="\n ")
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_parser.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Handwritten parser of dependency specifiers.
2
+
3
+ The docstring for each __parse_* function contains EBNF-inspired grammar representing
4
+ the implementation.
5
+ """
6
+
7
+ import ast
8
+ from typing import Any, List, NamedTuple, Optional, Tuple, Union
9
+
10
+ from ._tokenizer import DEFAULT_RULES, Tokenizer
11
+
12
+
13
class Node:
    """Base class for parsed marker elements; wraps a single string value."""

    def __init__(self, value: str) -> None:
        self.value = value

    def __str__(self) -> str:
        return self.value

    def __repr__(self) -> str:
        return "<{}('{}')>".format(self.__class__.__name__, str(self))

    def serialize(self) -> str:
        """Return the textual form of the node; subclasses must override."""
        raise NotImplementedError
25
+
26
+
27
class Variable(Node):
    """A marker variable; serialized as its bare name."""

    def serialize(self) -> str:
        return str(self)
30
+
31
+
32
class Value(Node):
    """A literal marker value; serialized wrapped in double quotes."""

    def serialize(self) -> str:
        return '"' + str(self) + '"'
35
+
36
+
37
class Op(Node):
    """A marker operator; serialized as its bare text."""

    def serialize(self) -> str:
        return str(self)
40
+
41
+
42
# One operand of a marker comparison.
MarkerVar = Union[Variable, Value]
# A single comparison: (left operand, operator, right operand).
MarkerItem = Tuple[MarkerVar, Op, MarkerVar]
# The accurate recursive definitions would be:
#     MarkerAtom = Union[MarkerItem, List["MarkerAtom"]]
#     MarkerList = List[Union["MarkerList", MarkerAtom, str]]
# but mypy does not support recursive type definition
# (https://github.com/python/mypy/issues/731), so ``Any`` is used instead.
MarkerAtom = Any
MarkerList = List[Any]
50
+
51
+
52
class ParsedRequirement(NamedTuple):
    """Components of a parsed dependency specifier."""

    name: str  # package name
    url: str  # text after "@", or "" when absent
    extras: List[str]  # names listed inside the [...] section
    specifier: str  # concatenated version specifier text, or ""
    marker: Optional[MarkerList]  # parsed marker, or None when absent
58
+
59
+
60
+ # --------------------------------------------------------------------------------------
61
+ # Recursive descent parser for dependency specifier
62
+ # --------------------------------------------------------------------------------------
63
def parse_requirement(source: str) -> ParsedRequirement:
    """Parse a dependency specifier string using the default token rules."""
    tokenizer = Tokenizer(source, rules=DEFAULT_RULES)
    return _parse_requirement(tokenizer)
65
+
66
+
67
def _parse_requirement(tokenizer: Tokenizer) -> ParsedRequirement:
    """
    requirement = WS? IDENTIFIER WS? extras WS? requirement_details
    """
    tokenizer.consume("WS")
    name = tokenizer.expect(
        "IDENTIFIER", expected="package name at the start of dependency specifier"
    ).text

    tokenizer.consume("WS")
    extras = _parse_extras(tokenizer)

    tokenizer.consume("WS")
    url, specifier, marker = _parse_requirement_details(tokenizer)

    # Nothing may follow the details section.
    tokenizer.expect("END", expected="end of dependency specifier")

    return ParsedRequirement(
        name=name, url=url, extras=extras, specifier=specifier, marker=marker
    )
86
+
87
+
88
def _parse_requirement_details(
    tokenizer: Tokenizer,
) -> Tuple[str, str, Optional[MarkerList]]:
    """
    requirement_details = AT URL (WS requirement_marker?)?
                        | specifier WS? (requirement_marker)?

    Returns ``(url, specifier, marker)``; the url or specifier that is not
    present is ``""`` and a missing marker is ``None``.
    """
    if not tokenizer.check("AT"):
        # Version-specifier form.
        spec_begin = tokenizer.position
        specifier = _parse_specifier(tokenizer)
        tokenizer.consume("WS")

        if tokenizer.check("END", peek=True):
            return ("", specifier, None)

        context = (
            "version specifier"
            if specifier
            else "name and no valid version specifier"
        )
        marker = _parse_requirement_marker(
            tokenizer, span_start=spec_begin, after=context
        )
        return ("", specifier, marker)

    # Direct-reference form: consume the "@" that was just matched.
    tokenizer.read()
    tokenizer.consume("WS")

    url_begin = tokenizer.position
    url = tokenizer.expect("URL", expected="URL after @").text
    if tokenizer.check("END", peek=True):
        return (url, "", None)

    tokenizer.expect("WS", expected="whitespace after URL")

    # The input might end after whitespace.
    if tokenizer.check("END", peek=True):
        return (url, "", None)

    marker = _parse_requirement_marker(
        tokenizer, span_start=url_begin, after="URL and whitespace"
    )
    return (url, "", marker)
137
+
138
+
139
def _parse_requirement_marker(
    tokenizer: Tokenizer, *, span_start: int, after: str
) -> MarkerList:
    """
    requirement_marker = SEMICOLON marker WS?

    ``span_start`` and ``after`` are only used to build the syntax error
    raised when the semicolon is missing.
    """
    if tokenizer.check("SEMICOLON"):
        tokenizer.read()
    else:
        tokenizer.raise_syntax_error(
            f"Expected end or semicolon (after {after})",
            span_start=span_start,
        )

    parsed = _parse_marker(tokenizer)
    tokenizer.consume("WS")
    return parsed
157
+
158
+
159
def _parse_extras(tokenizer: Tokenizer) -> List[str]:
    """
    extras = (LEFT_BRACKET wsp* extras_list? wsp* RIGHT_BRACKET)?
    """
    if tokenizer.check("LEFT_BRACKET", peek=True):
        with tokenizer.enclosing_tokens(
            "LEFT_BRACKET",
            "RIGHT_BRACKET",
            around="extras",
        ):
            tokenizer.consume("WS")
            names = _parse_extras_list(tokenizer)
            tokenizer.consume("WS")
        return names

    # No bracketed section at all.
    return []
176
+
177
+
178
def _parse_extras_list(tokenizer: Tokenizer) -> List[str]:
    """
    extras_list = identifier (wsp* ',' wsp* identifier)*
    """
    if not tokenizer.check("IDENTIFIER"):
        return []

    names: List[str] = [tokenizer.read().text]

    while True:
        tokenizer.consume("WS")
        # Two identifiers separated only by whitespace: a comma is missing.
        if tokenizer.check("IDENTIFIER", peek=True):
            tokenizer.raise_syntax_error("Expected comma between extra names")
        if not tokenizer.check("COMMA"):
            return names

        tokenizer.read()
        tokenizer.consume("WS")

        following = tokenizer.expect(
            "IDENTIFIER", expected="extra name after comma"
        )
        names.append(following.text)
203
+
204
+
205
def _parse_specifier(tokenizer: Tokenizer) -> str:
    """
    specifier = LEFT_PARENTHESIS WS? version_many WS? RIGHT_PARENTHESIS
              | WS? version_many WS?
    """
    # enclosing_tokens handles both forms: the parentheses are optional,
    # but an opening one requires its matching close.
    with tokenizer.enclosing_tokens(
        "LEFT_PARENTHESIS",
        "RIGHT_PARENTHESIS",
        around="version specifier",
    ):
        tokenizer.consume("WS")
        versions = _parse_version_many(tokenizer)
        tokenizer.consume("WS")

    return versions
220
+
221
+
222
def _parse_version_many(tokenizer: Tokenizer) -> str:
    """
    version_many = (SPECIFIER (WS? COMMA WS? SPECIFIER)*)?

    Returns the matched specifiers and commas concatenated, with the
    whitespace between them dropped.
    """
    pieces: List[str] = []
    while tokenizer.check("SPECIFIER"):
        clause_start = tokenizer.position
        pieces.append(tokenizer.read().text)

        # A trailing ".*" or "+local" left unconsumed by SPECIFIER means the
        # operator in front of it does not allow that suffix.
        if tokenizer.check("VERSION_PREFIX_TRAIL", peek=True):
            tokenizer.raise_syntax_error(
                ".* suffix can only be used with `==` or `!=` operators",
                span_start=clause_start,
                span_end=tokenizer.position + 1,
            )
        if tokenizer.check("VERSION_LOCAL_LABEL_TRAIL", peek=True):
            tokenizer.raise_syntax_error(
                "Local version label can only be used with `==` or `!=` operators",
                span_start=clause_start,
                span_end=tokenizer.position,
            )

        tokenizer.consume("WS")
        if not tokenizer.check("COMMA"):
            break
        pieces.append(tokenizer.read().text)
        tokenizer.consume("WS")

    return "".join(pieces)
249
+
250
+
251
+ # --------------------------------------------------------------------------------------
252
+ # Recursive descent parser for marker expression
253
+ # --------------------------------------------------------------------------------------
254
def parse_marker(source: str) -> MarkerList:
    """Parse a complete marker expression using the default token rules."""
    tokenizer = Tokenizer(source, rules=DEFAULT_RULES)
    return _parse_full_marker(tokenizer)
256
+
257
+
258
def _parse_full_marker(tokenizer: Tokenizer) -> MarkerList:
    """Parse a marker and require that it is followed by the end of input."""
    parsed = _parse_marker(tokenizer)
    tokenizer.expect("END", expected="end of marker expression")
    return parsed
262
+
263
+
264
def _parse_marker(tokenizer: Tokenizer) -> MarkerList:
    """
    marker = marker_atom (BOOLOP marker_atom)*

    Returns a flat list alternating atoms and boolean operator strings.
    """
    sequence = [_parse_marker_atom(tokenizer)]
    while tokenizer.check("BOOLOP"):
        boolop = tokenizer.read()
        operand = _parse_marker_atom(tokenizer)
        sequence.append(boolop.text)
        sequence.append(operand)
    return sequence
274
+
275
+
276
def _parse_marker_atom(tokenizer: Tokenizer) -> MarkerAtom:
    """
    marker_atom = WS? LEFT_PARENTHESIS WS? marker WS? RIGHT_PARENTHESIS WS?
                | WS? marker_item WS?
    """
    tokenizer.consume("WS")
    if not tokenizer.check("LEFT_PARENTHESIS", peek=True):
        atom: MarkerAtom = _parse_marker_item(tokenizer)
    else:
        # Parenthesised sub-expression: parsed recursively into a nested list.
        with tokenizer.enclosing_tokens(
            "LEFT_PARENTHESIS",
            "RIGHT_PARENTHESIS",
            around="marker expression",
        ):
            tokenizer.consume("WS")
            atom = _parse_marker(tokenizer)
            tokenizer.consume("WS")
    tokenizer.consume("WS")
    return atom
296
+
297
+
298
def _parse_marker_item(tokenizer: Tokenizer) -> MarkerItem:
    """
    marker_item = WS? marker_var WS? marker_op WS? marker_var WS?
    """
    tokenizer.consume("WS")
    left = _parse_marker_var(tokenizer)

    tokenizer.consume("WS")
    operator_node = _parse_marker_op(tokenizer)

    tokenizer.consume("WS")
    right = _parse_marker_var(tokenizer)

    tokenizer.consume("WS")
    return (left, operator_node, right)
310
+
311
+
312
def _parse_marker_var(tokenizer: Tokenizer) -> MarkerVar:
    """
    marker_var = VARIABLE | QUOTED_STRING
    """
    if tokenizer.check("VARIABLE"):
        # Dotted spellings (e.g. "os.name") are mapped to the underscore form.
        raw_name = tokenizer.read().text
        return process_env_var(raw_name.replace(".", "_"))
    if tokenizer.check("QUOTED_STRING"):
        return process_python_str(tokenizer.read().text)
    tokenizer.raise_syntax_error(
        message="Expected a marker variable or quoted string"
    )
324
+
325
+
326
def process_env_var(env_var: str) -> Variable:
    """Wrap ``env_var`` in a Variable.

    ``python_implementation`` is mapped to ``platform_python_implementation``.
    """
    aliases = ("platform_python_implementation", "python_implementation")
    name = "platform_python_implementation" if env_var in aliases else env_var
    return Variable(name)
331
+
332
+
333
def process_python_str(python_str: str) -> Value:
    """Evaluate a quoted string literal and wrap the result in a Value."""
    return Value(str(ast.literal_eval(python_str)))
336
+
337
+
338
def _parse_marker_op(tokenizer: Tokenizer) -> Op:
    """
    marker_op = IN | NOT IN | OP
    """
    if tokenizer.check("IN"):
        tokenizer.read()
        return Op("in")

    if tokenizer.check("NOT"):
        tokenizer.read()
        # "not" is only valid as the first half of "not in".
        tokenizer.expect("WS", expected="whitespace after 'not'")
        tokenizer.expect("IN", expected="'in' after 'not'")
        return Op("not in")

    if tokenizer.check("OP"):
        return Op(tokenizer.read().text)

    return tokenizer.raise_syntax_error(
        "Expected marker operator, one of "
        "<=, <, !=, ==, >=, >, ~=, ===, in, not in"
    )
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_structures.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is dual licensed under the terms of the Apache License, Version
2
+ # 2.0, and the BSD License. See the LICENSE file in the root of this repository
3
+ # for complete details.
4
+
5
+
6
class InfinityType:
    """Sentinel whose ordering comparisons always report it as the greater side.

    ``<`` and ``<=`` are always false, ``>`` and ``>=`` are always true, and
    equality holds only against another instance of this class.
    """

    def __repr__(self) -> str:
        return "Infinity"

    def __hash__(self) -> int:
        return hash(repr(self))

    def __eq__(self, other: object) -> bool:
        return isinstance(other, self.__class__)

    def __lt__(self, other: object) -> bool:
        return False

    def __le__(self, other: object) -> bool:
        return False

    def __gt__(self, other: object) -> bool:
        return True

    def __ge__(self, other: object) -> bool:
        return True

    def __neg__(self: object) -> "NegativeInfinityType":
        return NegativeInfinity


# Module-level instance of the sentinel.
Infinity = InfinityType()
33
+
34
+
35
class NegativeInfinityType:
    """Sentinel whose ordering comparisons always report it as the lesser side.

    ``<`` and ``<=`` are always true, ``>`` and ``>=`` are always false, and
    equality holds only against another instance of this class.
    """

    def __repr__(self) -> str:
        return "-Infinity"

    def __hash__(self) -> int:
        return hash(repr(self))

    def __eq__(self, other: object) -> bool:
        return isinstance(other, self.__class__)

    def __lt__(self, other: object) -> bool:
        return True

    def __le__(self, other: object) -> bool:
        return True

    def __gt__(self, other: object) -> bool:
        return False

    def __ge__(self, other: object) -> bool:
        return False

    def __neg__(self: object) -> InfinityType:
        return Infinity


# Module-level instance of the sentinel.
NegativeInfinity = NegativeInfinityType()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/_tokenizer.py ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import contextlib
2
+ import re
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Iterator, NoReturn, Optional, Tuple, Union
5
+
6
+ from .specifiers import Specifier
7
+
8
+
9
@dataclass
class Token:
    """A matched token: rule name, matched text, and where the match began."""

    name: str  # name of the rule that matched
    text: str  # matched source text
    position: int  # tokenizer position at which the match started
14
+
15
+
16
class ParserSyntaxError(Exception):
    """The provided source text could not be parsed correctly."""

    def __init__(
        self,
        message: str,
        *,
        source: str,
        span: Tuple[int, int],
    ) -> None:
        self.message = message
        self.source = source
        self.span = span

        super().__init__()

    def __str__(self) -> str:
        # Third line underlines the span with "~" and ends it with "^".
        padding = " " * self.span[0]
        underline = "~" * (self.span[1] - self.span[0])
        return "\n ".join((self.message, self.source, padding + underline + "^"))
35
+
36
+
37
# Token name -> pattern. Plain strings are compiled by Tokenizer.__init__;
# entries that need flags are pre-compiled here.
DEFAULT_RULES: "Dict[str, Union[str, re.Pattern[str]]]" = {
    # Punctuation.
    "LEFT_PARENTHESIS": r"\(",
    "RIGHT_PARENTHESIS": r"\)",
    "LEFT_BRACKET": r"\[",
    "RIGHT_BRACKET": r"\]",
    "SEMICOLON": r";",
    "COMMA": r",",
    # Single- or double-quoted string without the same quote inside.
    "QUOTED_STRING": re.compile(
        r"""
            (
                ('[^']*')
                |
                ("[^"]*")
            )
        """,
        re.VERBOSE,
    ),
    # Marker operators and keywords.
    "OP": r"(===|==|~=|!=|<=|>=|<|>)",
    "BOOLOP": r"\b(or|and)\b",
    "IN": r"\bin\b",
    "NOT": r"\bnot\b",
    # Recognised marker variable names (dotted and underscore spellings).
    "VARIABLE": re.compile(
        r"""
            \b(
                python_version
                |python_full_version
                |os[._]name
                |sys[._]platform
                |platform_(release|system)
                |platform[._](version|machine|python_implementation)
                |python_implementation
                |implementation_(name|version)
                |extra
            )\b
        """,
        re.VERBOSE,
    ),
    # Operator + version, reusing the Specifier class's regex fragments.
    "SPECIFIER": re.compile(
        Specifier._operator_regex_str + Specifier._version_regex_str,
        re.VERBOSE | re.IGNORECASE,
    ),
    "AT": r"\@",
    "URL": r"[^ \t]+",
    "IDENTIFIER": r"\b[a-zA-Z0-9][a-zA-Z0-9._-]*\b",
    "VERSION_PREFIX_TRAIL": r"\.\*",
    "VERSION_LOCAL_LABEL_TRAIL": r"\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*",
    "WS": r"[ \t]+",
    "END": r"$",
}
86
+
87
+
88
class Tokenizer:
    """Context-sensitive token parsing.

    Provides methods to examine the input stream to check whether the next token
    matches.
    """

    def __init__(
        self,
        source: str,
        *,
        rules: "Dict[str, Union[str, re.Pattern[str]]]",
    ) -> None:
        self.source = source
        # Every rule is passed through re.compile, whether str or pattern.
        self.rules: Dict[str, re.Pattern[str]] = {
            rule_name: re.compile(rule) for rule_name, rule in rules.items()
        }
        self.next_token: Optional[Token] = None
        self.position = 0

    def consume(self, name: str) -> None:
        """Move beyond provided token name, if at current position."""
        if not self.check(name):
            return
        self.read()

    def check(self, name: str, *, peek: bool = False) -> bool:
        """Check whether the next token has the provided name.

        By default, if the check succeeds, the token *must* be read before
        another check. If `peek` is set to `True`, the token is not loaded and
        would need to be checked again.
        """
        assert (
            self.next_token is None
        ), f"Cannot check for {name!r}, already have {self.next_token!r}"
        assert name in self.rules, f"Unknown token name: {name!r}"

        found = self.rules[name].match(self.source, self.position)
        if found is None:
            return False
        if not peek:
            self.next_token = Token(name, found[0], self.position)
        return True

    def expect(self, name: str, *, expected: str) -> Token:
        """Expect a certain token name next, failing with a syntax error otherwise.

        On success the token is read and returned.
        """
        if self.check(name):
            return self.read()
        self.raise_syntax_error(f"Expected {expected}")

    def read(self) -> Token:
        """Consume the next token and return it."""
        pending = self.next_token
        assert pending is not None

        self.next_token = None
        self.position += len(pending.text)

        return pending

    def raise_syntax_error(
        self,
        message: str,
        *,
        span_start: Optional[int] = None,
        span_end: Optional[int] = None,
    ) -> NoReturn:
        """Raise ParserSyntaxError at the given position."""
        # Either end of the span defaults to the current position.
        start = self.position if span_start is None else span_start
        end = self.position if span_end is None else span_end
        raise ParserSyntaxError(
            message,
            source=self.source,
            span=(start, end),
        )

    @contextlib.contextmanager
    def enclosing_tokens(
        self, open_token: str, close_token: str, *, around: str
    ) -> Iterator[None]:
        """Optionally match ``open_token`` before the body and, if it was
        present, require ``close_token`` after it."""
        opened_at: Optional[int] = None
        if self.check(open_token):
            opened_at = self.position
            self.read()

        yield

        if opened_at is None:
            return

        if not self.check(close_token):
            self.raise_syntax_error(
                f"Expected matching {close_token} for {open_token}, after {around}",
                span_start=opened_at,
            )

        self.read()
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/markers.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is dual licensed under the terms of the Apache License, Version
2
+ # 2.0, and the BSD License. See the LICENSE file in the root of this repository
3
+ # for complete details.
4
+
5
+ import operator
6
+ import os
7
+ import platform
8
+ import sys
9
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
+
11
+ from ._parser import (
12
+ MarkerAtom,
13
+ MarkerList,
14
+ Op,
15
+ Value,
16
+ Variable,
17
+ )
18
+ from ._parser import (
19
+ parse_marker as _parse_marker,
20
+ )
21
+ from ._tokenizer import ParserSyntaxError
22
+ from .specifiers import InvalidSpecifier, Specifier
23
+ from .utils import canonicalize_name
24
+
25
+ __all__ = [
26
+ "InvalidMarker",
27
+ "UndefinedComparison",
28
+ "UndefinedEnvironmentName",
29
+ "Marker",
30
+ "default_environment",
31
+ ]
32
+
33
+ Operator = Callable[[str, str], bool]
34
+
35
+
36
class InvalidMarker(ValueError):
    """
    Raised for a marker that is not valid; users should refer to PEP 508.
    """
40
+
41
+
42
class UndefinedComparison(ValueError):
    """
    Raised when an operation is attempted on a value that doesn't support it.
    """
46
+
47
+
48
class UndefinedEnvironmentName(ValueError):
    """
    Raised when a name is used that does not exist inside of the
    environment.
    """
53
+
54
+
55
+ def _normalize_extra_values(results: Any) -> Any:
56
+ """
57
+ Normalize extra values.
58
+ """
59
+ if isinstance(results[0], tuple):
60
+ lhs, op, rhs = results[0]
61
+ if isinstance(lhs, Variable) and lhs.value == "extra":
62
+ normalized_extra = canonicalize_name(rhs.value)
63
+ rhs = Value(normalized_extra)
64
+ elif isinstance(rhs, Variable) and rhs.value == "extra":
65
+ normalized_extra = canonicalize_name(lhs.value)
66
+ lhs = Value(normalized_extra)
67
+ results[0] = lhs, op, rhs
68
+ return results
69
+
70
+
71
def _format_marker(
    marker: Union[List[str], MarkerAtom, str], first: Optional[bool] = True
) -> str:
    """Render a parsed marker structure back into marker text.

    Lists are joined with spaces and, unless ``first`` is true, wrapped in
    parentheses; tuples are rendered via each element's ``serialize()``;
    strings are returned unchanged.
    """
    assert isinstance(marker, (list, tuple, str))

    if isinstance(marker, list):
        # Sometimes we have a structure like [[...]] which is a single item
        # list where the single item is itself its own list. In that case
        # recurse straight into it so that we don't get extraneous () on the
        # outside.
        if len(marker) == 1 and isinstance(marker[0], (list, tuple)):
            return _format_marker(marker[0])

        parts = (_format_marker(part, first=False) for part in marker)
        if first:
            return " ".join(parts)
        return "(" + " ".join(parts) + ")"

    if isinstance(marker, tuple):
        return " ".join([node.serialize() for node in marker])

    return marker
97
+
98
+
99
# Fallback comparisons used by _eval_op when the operator and right-hand
# side do not form a valid version Specifier.
_operators: Dict[str, Operator] = {
    "in": lambda needle, haystack: needle in haystack,
    "not in": lambda needle, haystack: needle not in haystack,
    "<": operator.lt,
    "<=": operator.le,
    "==": operator.eq,
    "!=": operator.ne,
    ">=": operator.ge,
    ">": operator.gt,
}
109
+
110
+
111
def _eval_op(lhs: str, op: Op, rhs: str) -> bool:
    """Evaluate ``lhs <op> rhs``.

    A version-specifier comparison is tried first; when ``op`` plus ``rhs``
    is not a valid specifier, the plain operator table is used instead.

    :raises UndefinedComparison: if the operator is not in the table.
    """
    try:
        spec: Optional[Specifier] = Specifier(op.serialize() + rhs)
    except InvalidSpecifier:
        spec = None

    if spec is not None:
        return spec.contains(lhs, prereleases=True)

    compare: Optional[Operator] = _operators.get(op.serialize())
    if compare is None:
        raise UndefinedComparison(f"Undefined {op!r} on {lhs!r} and {rhs!r}.")

    return compare(lhs, rhs)
124
+
125
+
126
+ def _normalize(*values: str, key: str) -> Tuple[str, ...]:
127
+ # PEP 685 – Comparison of extra names for optional distribution dependencies
128
+ # https://peps.python.org/pep-0685/
129
+ # > When comparing extra names, tools MUST normalize the names being
130
+ # > compared using the semantics outlined in PEP 503 for names
131
+ if key == "extra":
132
+ return tuple(canonicalize_name(v) for v in values)
133
+
134
+ # other environment markers don't have such standards
135
+ return values
136
+
137
+
138
def _evaluate_markers(markers: MarkerList, environment: Dict[str, str]) -> bool:
    """Evaluate a parsed marker list against ``environment``.

    Results are collected into groups separated by ``"or"``; the marker is
    true when every result in at least one group is true.
    """
    or_groups: List[List[bool]] = [[]]

    for element in markers:
        assert isinstance(element, (list, tuple, str))

        if isinstance(element, list):
            # Nested (parenthesised) expression.
            or_groups[-1].append(_evaluate_markers(element, environment))
        elif isinstance(element, tuple):
            left, op, right = element

            # Whichever side is not a Variable is taken as the literal; the
            # other side is looked up in the environment.
            if isinstance(left, Variable):
                env_key = left.value
                left_value = environment[env_key]
                right_value = right.value
            else:
                left_value = left.value
                env_key = right.value
                right_value = environment[env_key]

            left_value, right_value = _normalize(
                left_value, right_value, key=env_key
            )
            or_groups[-1].append(_eval_op(left_value, op, right_value))
        else:
            assert element in ["and", "or"]
            if element == "or":
                or_groups.append([])

    return any(all(group) for group in or_groups)
166
+
167
+
168
def format_full_version(info: "sys._version_info") -> str:
    """Format ``info`` as ``major.minor.micro``.

    For a release level other than ``final``, the level's first letter and
    the serial number are appended.
    """
    base = f"{info.major}.{info.minor}.{info.micro}"
    level = info.releaselevel
    if level == "final":
        return base
    return base + level[0] + str(info.serial)
174
+
175
+
176
def default_environment() -> Dict[str, str]:
    """Build the marker environment from the running interpreter and platform."""
    implementation = sys.implementation
    implementation_version = format_full_version(implementation.version)
    python_major_minor = ".".join(platform.python_version_tuple()[:2])
    return {
        "implementation_name": implementation.name,
        "implementation_version": implementation_version,
        "os_name": os.name,
        "platform_machine": platform.machine(),
        "platform_release": platform.release(),
        "platform_system": platform.system(),
        "platform_version": platform.version(),
        "python_full_version": platform.python_version(),
        "platform_python_implementation": platform.python_implementation(),
        "python_version": python_major_minor,
        "sys_platform": sys.platform,
    }
192
+
193
+
194
+ class Marker:
195
+ def __init__(self, marker: str) -> None:
196
+ # Note: We create a Marker object without calling this constructor in
197
+ # packaging.requirements.Requirement. If any additional logic is
198
+ # added here, make sure to mirror/adapt Requirement.
199
+ try:
200
+ self._markers = _normalize_extra_values(_parse_marker(marker))
201
+ # The attribute `_markers` can be described in terms of a recursive type:
202
+ # MarkerList = List[Union[Tuple[Node, ...], str, MarkerList]]
203
+ #
204
+ # For example, the following expression:
205
+ # python_version > "3.6" or (python_version == "3.6" and os_name == "unix")
206
+ #
207
+ # is parsed into:
208
+ # [
209
+ # (<Variable('python_version')>, <Op('>')>, <Value('3.6')>),
210
+ # 'and',
211
+ # [
212
+ # (<Variable('python_version')>, <Op('==')>, <Value('3.6')>),
213
+ # 'or',
214
+ # (<Variable('os_name')>, <Op('==')>, <Value('unix')>)
215
+ # ]
216
+ # ]
217
+ except ParserSyntaxError as e:
218
+ raise InvalidMarker(str(e)) from e
219
+
220
+ def __str__(self) -> str:
221
+ return _format_marker(self._markers)
222
+
223
+ def __repr__(self) -> str:
224
+ return f"<Marker('{self}')>"
225
+
226
+ def __hash__(self) -> int:
227
+ return hash((self.__class__.__name__, str(self)))
228
+
229
+ def __eq__(self, other: Any) -> bool:
230
+ if not isinstance(other, Marker):
231
+ return NotImplemented
232
+
233
+ return str(self) == str(other)
234
+
235
+ def evaluate(self, environment: Optional[Dict[str, str]] = None) -> bool:
236
+ """Evaluate a marker.
237
+
238
+ Return the boolean from evaluating the given marker against the
239
+ environment. environment is an optional argument to override all or
240
+ part of the determined environment.
241
+
242
+ The environment is determined from the current Python process.
243
+ """
244
+ current_environment = default_environment()
245
+ current_environment["extra"] = ""
246
+ if environment is not None:
247
+ current_environment.update(environment)
248
+ # The API used to allow setting extra to None. We need to handle this
249
+ # case for backwards compatibility.
250
+ if current_environment["extra"] is None:
251
+ current_environment["extra"] = ""
252
+
253
+ return _evaluate_markers(self._markers, current_environment)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/specifiers.py ADDED
@@ -0,0 +1,1011 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is dual licensed under the terms of the Apache License, Version
2
+ # 2.0, and the BSD License. See the LICENSE file in the root of this repository
3
+ # for complete details.
4
+ """
5
+ .. testsetup::
6
+
7
+ from packaging.specifiers import Specifier, SpecifierSet, InvalidSpecifier
8
+ from packaging.version import Version
9
+ """
10
+
11
+ import abc
12
+ import itertools
13
+ import re
14
+ from typing import Callable, Iterable, Iterator, List, Optional, Tuple, TypeVar, Union
15
+
16
+ from .utils import canonicalize_version
17
+ from .version import Version
18
+
19
+ UnparsedVersion = Union[Version, str]
20
+ UnparsedVersionVar = TypeVar("UnparsedVersionVar", bound=UnparsedVersion)
21
+ CallableOperator = Callable[[Version, str], bool]
22
+
23
+
24
def _coerce_version(version: UnparsedVersion) -> Version:
    """Return *version* as a ``Version``, constructing one if it is not one already."""
    return version if isinstance(version, Version) else Version(version)
28
+
29
+
30
class InvalidSpecifier(ValueError):
    """
    Error raised when a :class:`Specifier` is built from a string that does
    not match the specifier syntax.

    >>> Specifier("lolwat")
    Traceback (most recent call last):
        ...
    packaging.specifiers.InvalidSpecifier: Invalid specifier: 'lolwat'
    """
40
+
41
+
42
class BaseSpecifier(metaclass=abc.ABCMeta):
    """Abstract interface shared by the specifier-like classes in this module."""

    @abc.abstractmethod
    def __str__(self) -> str:
        """
        Return the string form of this Specifier-like object; it should be
        representative of the specifier itself.
        """

    @abc.abstractmethod
    def __hash__(self) -> int:
        """
        Return a hash value for this Specifier-like object.
        """

    @abc.abstractmethod
    def __eq__(self, other: object) -> bool:
        """
        Report whether this Specifier-like object equals *other*.

        :param other: The other object to check against.
        """

    @property
    @abc.abstractmethod
    def prereleases(self) -> Optional[bool]:
        """Whether pre-releases as a whole are allowed.

        ``True`` or ``False`` explicitly enables or disables pre-releases;
        ``None`` (the default) requests the default semantics.
        """

    @prereleases.setter
    def prereleases(self, value: bool) -> None:
        """Setter for :attr:`prereleases`.

        :param value: The value to set.
        """

    @abc.abstractmethod
    def contains(self, item: str, prereleases: Optional[bool] = None) -> bool:
        """
        Decide whether *item* is contained within this specifier.
        """

    @abc.abstractmethod
    def filter(
        self, iterable: Iterable[UnparsedVersionVar], prereleases: Optional[bool] = None
    ) -> Iterator[UnparsedVersionVar]:
        """
        Take an iterable of items and yield only those that are contained
        within this specifier.
        """
95
+
96
+
97
class Specifier(BaseSpecifier):
    """This class abstracts handling of version specifiers.

    .. tip::

        It is generally not required to instantiate this manually. You should
        prefer to work with :class:`SpecifierSet` instead, which can parse
        comma-separated version specifiers (which is what package metadata contains).
    """

    _operator_regex_str = r"""
        (?P<operator>(~=|==|!=|<=|>=|<|>|===))
        """
    _version_regex_str = r"""
        (?P<version>
            (?:
                # The identity operators allow for an escape hatch that will
                # do an exact string match of the version you wish to install.
                # This will not be parsed by PEP 440 and we cannot determine
                # any semantic meaning from it. This operator is discouraged
                # but included entirely as an escape hatch.
                (?<====)  # Only match for the identity operator
                \s*
                [^\s;)]*  # The arbitrary version can be just about anything,
                          # we match everything except for whitespace, a
                          # semi-colon for marker support, and a closing paren
                          # since versions can be enclosed in them.
            )
            |
            (?:
                # The (non)equality operators allow for wild card and local
                # versions to be specified so we have to define these two
                # operators separately to enable that.
                (?<===|!=)            # Only match for equals and not equals

                \s*
                v?
                (?:[0-9]+!)?          # epoch
                [0-9]+(?:\.[0-9]+)*   # release

                # You cannot use a wild card and a pre-release, post-release, a dev or
                # local version together so group them with a | and make them optional.
                (?:
                    \.\*  # Wild card syntax of .*
                    |
                    (?:                                  # pre release
                        [-_\.]?
                        (alpha|beta|preview|pre|a|b|c|rc)
                        [-_\.]?
                        [0-9]*
                    )?
                    (?:                                  # post release
                        (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*)
                    )?
                    (?:[-_\.]?dev[-_\.]?[0-9]*)?         # dev release
                    (?:\+[a-z0-9]+(?:[-_\.][a-z0-9]+)*)? # local
                )?
            )
            |
            (?:
                # The compatible operator requires at least two digits in the
                # release segment.
                (?<=~=)               # Only match for the compatible operator

                \s*
                v?
                (?:[0-9]+!)?          # epoch
                [0-9]+(?:\.[0-9]+)+   # release  (We have a + instead of a *)
                (?:                   # pre release
                    [-_\.]?
                    (alpha|beta|preview|pre|a|b|c|rc)
                    [-_\.]?
                    [0-9]*
                )?
                (?:                                   # post release
                    (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*)
                )?
                (?:[-_\.]?dev[-_\.]?[0-9]*)?          # dev release
            )
            |
            (?:
                # All other operators only allow a sub set of what the
                # (non)equality operators do. Specifically they do not allow
                # local versions to be specified nor do they allow the prefix
                # matching wild cards.
                (?<!==|!=|~=)         # We have special cases for these
                                      # operators so we want to make sure they
                                      # don't match here.

                \s*
                v?
                (?:[0-9]+!)?          # epoch
                [0-9]+(?:\.[0-9]+)*   # release
                (?:                   # pre release
                    [-_\.]?
                    (alpha|beta|preview|pre|a|b|c|rc)
                    [-_\.]?
                    [0-9]*
                )?
                (?:                                   # post release
                    (?:-[0-9]+)|(?:[-_\.]?(post|rev|r)[-_\.]?[0-9]*)
                )?
                (?:[-_\.]?dev[-_\.]?[0-9]*)?          # dev release
            )
        )
        """

    _regex = re.compile(
        r"^\s*" + _operator_regex_str + _version_regex_str + r"\s*$",
        re.VERBOSE | re.IGNORECASE,
    )

    # Maps each operator token to the suffix of the ``_compare_<suffix>``
    # method that implements it (see ``_get_operator``).
    _operators = {
        "~=": "compatible",
        "==": "equal",
        "!=": "not_equal",
        "<=": "less_than_equal",
        ">=": "greater_than_equal",
        "<": "less_than",
        ">": "greater_than",
        "===": "arbitrary",
    }

    def __init__(self, spec: str = "", prereleases: Optional[bool] = None) -> None:
        """Initialize a Specifier instance.

        :param spec:
            The string representation of a specifier which will be parsed and
            normalized before use.
        :param prereleases:
            This tells the specifier if it should accept prerelease versions if
            applicable or not. The default of ``None`` will autodetect it from the
            given specifiers.
        :raises InvalidSpecifier:
            If the given specifier is invalid (i.e. bad syntax).
        """
        match = self._regex.search(spec)
        if not match:
            raise InvalidSpecifier(f"Invalid specifier: '{spec}'")

        self._spec: Tuple[str, str] = (
            match.group("operator").strip(),
            match.group("version").strip(),
        )

        # Store whether or not this Specifier should accept prereleases
        self._prereleases = prereleases

    # https://github.com/python/mypy/pull/13475#pullrequestreview-1079784515
    @property  # type: ignore[override]
    def prereleases(self) -> bool:
        """Whether this specifier accepts pre-release versions.

        An explicitly configured value always wins. Otherwise the answer is
        ``True`` only when the operator is one of ``==``, ``>=``, ``<=``,
        ``~=`` or ``===`` and the specifier's own version is a pre-release.

        The operand of ``===`` is an arbitrary string that need not be a
        valid version. When it cannot be parsed it gives no indication that
        pre-releases are wanted, so ``False`` is returned instead of letting
        the parsing error escape.
        """
        # Imported locally so this block does not depend on a change to the
        # module-level import list.
        from .version import InvalidVersion

        # If there is an explicit prereleases set for this, then we'll just
        # blindly use that.
        if self._prereleases is not None:
            return self._prereleases

        # Look at all of our specifiers and determine if they are inclusive
        # operators, and if they are if they are including an explicit
        # prerelease.
        operator, version = self._spec
        if operator in ["==", ">=", "<=", "~=", "==="]:
            # The == specifier can include a trailing .*, if it does we
            # want to remove before parsing.
            if operator == "==" and version.endswith(".*"):
                version = version[:-2]

            try:
                parsed = Version(version)
            except InvalidVersion:
                # Only the arbitrary-equality operator may legitimately carry
                # an unparsable operand; anything else is unexpected.
                if operator != "===":
                    raise
                return False

            # If the specifier's version is a pre-release then this
            # specifier allows pre-releases.
            if parsed.is_prerelease:
                return True

        return False

    @prereleases.setter
    def prereleases(self, value: bool) -> None:
        self._prereleases = value

    @property
    def operator(self) -> str:
        """The operator of this specifier.

        >>> Specifier("==1.2.3").operator
        '=='
        """
        return self._spec[0]

    @property
    def version(self) -> str:
        """The version of this specifier.

        >>> Specifier("==1.2.3").version
        '1.2.3'
        """
        return self._spec[1]

    def __repr__(self) -> str:
        """A representation of the Specifier that shows all internal state.

        >>> Specifier('>=1.0.0')
        <Specifier('>=1.0.0')>
        >>> Specifier('>=1.0.0', prereleases=False)
        <Specifier('>=1.0.0', prereleases=False)>
        >>> Specifier('>=1.0.0', prereleases=True)
        <Specifier('>=1.0.0', prereleases=True)>
        """
        pre = (
            f", prereleases={self.prereleases!r}"
            if self._prereleases is not None
            else ""
        )

        return f"<{self.__class__.__name__}({str(self)!r}{pre})>"

    def __str__(self) -> str:
        """A string representation of the Specifier that can be round-tripped.

        >>> str(Specifier('>=1.0.0'))
        '>=1.0.0'
        >>> str(Specifier('>=1.0.0', prereleases=False))
        '>=1.0.0'
        """
        operator, version = self._spec
        return f"{operator}{version}"

    @property
    def _canonical_spec(self) -> Tuple[str, str]:
        """The ``(operator, canonicalized version)`` pair used for hashing and equality.

        Trailing zeros are kept for ``~=`` and stripped for every other operator.
        """
        canonical_version = canonicalize_version(
            self._spec[1],
            strip_trailing_zero=(self._spec[0] != "~="),
        )
        return self._spec[0], canonical_version

    def __hash__(self) -> int:
        return hash(self._canonical_spec)

    def __eq__(self, other: object) -> bool:
        """Whether or not the two Specifier-like objects are equal.

        :param other: The other object to check against.

        The value of :attr:`prereleases` is ignored.

        >>> Specifier("==1.2.3") == Specifier("== 1.2.3.0")
        True
        >>> (Specifier("==1.2.3", prereleases=False) ==
        ...  Specifier("==1.2.3", prereleases=True))
        True
        >>> Specifier("==1.2.3") == "==1.2.3"
        True
        >>> Specifier("==1.2.3") == Specifier("==1.2.4")
        False
        >>> Specifier("==1.2.3") == Specifier("~=1.2.3")
        False
        """
        if isinstance(other, str):
            try:
                other = self.__class__(str(other))
            except InvalidSpecifier:
                return NotImplemented
        elif not isinstance(other, self.__class__):
            return NotImplemented

        return self._canonical_spec == other._canonical_spec

    def _get_operator(self, op: str) -> CallableOperator:
        """Return the bound ``_compare_*`` method registered for operator *op*."""
        operator_callable: CallableOperator = getattr(
            self, f"_compare_{self._operators[op]}"
        )
        return operator_callable

    def _compare_compatible(self, prospective: Version, spec: str) -> bool:
        """Implement ``~=`` as the conjunction of ``>=`` and a prefix ``==``."""
        # Compatible releases have an equivalent combination of >= and ==. That
        # is that ~=2.2 is equivalent to >=2.2,==2.*. This allows us to
        # implement this in terms of the other specifiers instead of
        # implementing it ourselves. The only thing we need to do is construct
        # the other specifiers.

        # We want everything but the last item in the version, but we want to
        # ignore suffix segments.
        prefix = _version_join(
            list(itertools.takewhile(_is_not_suffix, _version_split(spec)))[:-1]
        )

        # Add the prefix notation to the end of our string
        prefix += ".*"

        return self._get_operator(">=")(prospective, spec) and self._get_operator("==")(
            prospective, prefix
        )

    def _compare_equal(self, prospective: Version, spec: str) -> bool:
        """Implement ``==``, including the trailing ``.*`` prefix-match form."""
        # We need special logic to handle prefix matching
        if spec.endswith(".*"):
            # In the case of prefix matching we want to ignore local segment.
            normalized_prospective = canonicalize_version(
                prospective.public, strip_trailing_zero=False
            )
            # Get the normalized version string ignoring the trailing .*
            normalized_spec = canonicalize_version(spec[:-2], strip_trailing_zero=False)
            # Split the spec out by bangs and dots, and pretend that there is
            # an implicit dot in between a release segment and a pre-release segment.
            split_spec = _version_split(normalized_spec)

            # Split the prospective version out by bangs and dots, and pretend
            # that there is an implicit dot in between a release segment and
            # a pre-release segment.
            split_prospective = _version_split(normalized_prospective)

            # 0-pad the prospective version before shortening it to get the correct
            # shortened version.
            padded_prospective, _ = _pad_version(split_prospective, split_spec)

            # Shorten the prospective version to be the same length as the spec
            # so that we can determine if the specifier is a prefix of the
            # prospective version or not.
            shortened_prospective = padded_prospective[: len(split_spec)]

            return shortened_prospective == split_spec
        else:
            # Convert our spec string into a Version
            spec_version = Version(spec)

            # If the specifier does not have a local segment, then we want to
            # act as if the prospective version also does not have a local
            # segment.
            if not spec_version.local:
                prospective = Version(prospective.public)

            return prospective == spec_version

    def _compare_not_equal(self, prospective: Version, spec: str) -> bool:
        """Implement ``!=`` as the negation of ``==``."""
        return not self._compare_equal(prospective, spec)

    def _compare_less_than_equal(self, prospective: Version, spec: str) -> bool:
        """Implement ``<=`` on the public (local-free) part of *prospective*."""
        # NB: Local version identifiers are NOT permitted in the version
        # specifier, so local version labels can be universally removed from
        # the prospective version.
        return Version(prospective.public) <= Version(spec)

    def _compare_greater_than_equal(self, prospective: Version, spec: str) -> bool:
        """Implement ``>=`` on the public (local-free) part of *prospective*."""
        # NB: Local version identifiers are NOT permitted in the version
        # specifier, so local version labels can be universally removed from
        # the prospective version.
        return Version(prospective.public) >= Version(spec)

    def _compare_less_than(self, prospective: Version, spec_str: str) -> bool:
        """Implement ``<``, excluding pre-releases of the specified version itself."""
        # Convert our spec to a Version instance, since we'll want to work with
        # it as a version.
        spec = Version(spec_str)

        # Check to see if the prospective version is less than the spec
        # version. If it's not we can short circuit and just return False now
        # instead of doing extra unneeded work.
        if not prospective < spec:
            return False

        # This special case is here so that, unless the specifier itself
        # is a pre-release version, we do not accept pre-release
        # versions for the version mentioned in the specifier (e.g. <3.1 should
        # not match 3.1.dev0, but should match 3.0.dev0).
        if not spec.is_prerelease and prospective.is_prerelease:
            if Version(prospective.base_version) == Version(spec.base_version):
                return False

        # If we've gotten to here, it means that prospective version is both
        # less than the spec version *and* it's not a pre-release of the same
        # version in the spec.
        return True

    def _compare_greater_than(self, prospective: Version, spec_str: str) -> bool:
        """Implement ``>``, excluding post-releases and local versions of the
        specified version itself."""
        # Convert our spec to a Version instance, since we'll want to work with
        # it as a version.
        spec = Version(spec_str)

        # Check to see if the prospective version is greater than the spec
        # version. If it's not we can short circuit and just return False now
        # instead of doing extra unneeded work.
        if not prospective > spec:
            return False

        # This special case is here so that, unless the specifier itself
        # is a post-release version, we do not accept
        # post-release versions for the version mentioned in the specifier
        # (e.g. >3.1 should not match 3.0.post0, but should match 3.2.post0).
        if not spec.is_postrelease and prospective.is_postrelease:
            if Version(prospective.base_version) == Version(spec.base_version):
                return False

        # Ensure that we do not allow a local version of the version mentioned
        # in the specifier, which is technically greater than, to match.
        if prospective.local is not None:
            if Version(prospective.base_version) == Version(spec.base_version):
                return False

        # If we've gotten to here, it means that prospective version is both
        # greater than the spec version *and* it's not a post-release or local
        # version of the same version in the spec.
        return True

    def _compare_arbitrary(self, prospective: Version, spec: str) -> bool:
        """Implement ``===`` as a case-insensitive comparison of the string forms."""
        return str(prospective).lower() == str(spec).lower()

    def __contains__(self, item: Union[str, Version]) -> bool:
        """Return whether or not the item is contained in this specifier.

        :param item: The item to check for.

        This is used for the ``in`` operator and behaves the same as
        :meth:`contains` with no ``prereleases`` argument passed.

        >>> "1.2.3" in Specifier(">=1.2.3")
        True
        >>> Version("1.2.3") in Specifier(">=1.2.3")
        True
        >>> "1.0.0" in Specifier(">=1.2.3")
        False
        >>> "1.3.0a1" in Specifier(">=1.2.3")
        False
        >>> "1.3.0a1" in Specifier(">=1.2.3", prereleases=True)
        True
        """
        return self.contains(item)

    def contains(
        self, item: UnparsedVersion, prereleases: Optional[bool] = None
    ) -> bool:
        """Return whether or not the item is contained in this specifier.

        :param item:
            The item to check for, which can be a version string or a
            :class:`Version` instance.
        :param prereleases:
            Whether or not to match prereleases with this Specifier. If set to
            ``None`` (the default), it uses :attr:`prereleases` to determine
            whether or not prereleases are allowed.

        >>> Specifier(">=1.2.3").contains("1.2.3")
        True
        >>> Specifier(">=1.2.3").contains(Version("1.2.3"))
        True
        >>> Specifier(">=1.2.3").contains("1.0.0")
        False
        >>> Specifier(">=1.2.3").contains("1.3.0a1")
        False
        >>> Specifier(">=1.2.3", prereleases=True).contains("1.3.0a1")
        True
        >>> Specifier(">=1.2.3").contains("1.3.0a1", prereleases=True)
        True
        """

        # Determine if prereleases are to be allowed or not.
        if prereleases is None:
            prereleases = self.prereleases

        # Normalize item to a Version, this allows us to have a shortcut for
        # "2.0" in Specifier(">=2")
        normalized_item = _coerce_version(item)

        # Determine if we should be supporting prereleases in this specifier
        # or not, if we do not support prereleases then we can short circuit
        # logic if this version is a prerelease.
        if normalized_item.is_prerelease and not prereleases:
            return False

        # Actually do the comparison to determine if this item is contained
        # within this Specifier or not.
        operator_callable: CallableOperator = self._get_operator(self.operator)
        return operator_callable(normalized_item, self.version)

    def filter(
        self, iterable: Iterable[UnparsedVersionVar], prereleases: Optional[bool] = None
    ) -> Iterator[UnparsedVersionVar]:
        """Filter items in the given iterable, that match the specifier.

        :param iterable:
            An iterable that can contain version strings and :class:`Version` instances.
            The items in the iterable will be filtered according to the specifier.
        :param prereleases:
            Whether or not to allow prereleases in the returned iterator. If set to
            ``None`` (the default), it will intelligently decide whether to allow
            prereleases or not (based on the :attr:`prereleases` attribute, and
            whether the only versions matching are prereleases).

        This method is smarter than just ``filter(Specifier().contains, [...])``
        because it implements the rule from :pep:`440` that a prerelease item
        SHOULD be accepted if no other versions match the given specifier.

        >>> list(Specifier(">=1.2.3").filter(["1.2", "1.3", "1.5a1"]))
        ['1.3']
        >>> list(Specifier(">=1.2.3").filter(["1.2", "1.2.3", "1.3", Version("1.4")]))
        ['1.2.3', '1.3', <Version('1.4')>]
        >>> list(Specifier(">=1.2.3").filter(["1.2", "1.5a1"]))
        ['1.5a1']
        >>> list(Specifier(">=1.2.3").filter(["1.3", "1.5a1"], prereleases=True))
        ['1.3', '1.5a1']
        >>> list(Specifier(">=1.2.3", prereleases=True).filter(["1.3", "1.5a1"]))
        ['1.3', '1.5a1']
        """

        yielded = False
        found_prereleases = []

        # With no explicit choice, match prereleases too so they can be held
        # back as a fallback below.
        kw = {"prereleases": prereleases if prereleases is not None else True}

        # Attempt to iterate over all the values in the iterable and if any of
        # them match, yield them.
        for version in iterable:
            parsed_version = _coerce_version(version)

            if self.contains(parsed_version, **kw):
                # If our version is a prerelease, and we were not set to allow
                # prereleases, then we'll store it for later in case nothing
                # else matches this specifier.
                if parsed_version.is_prerelease and not (
                    prereleases or self.prereleases
                ):
                    found_prereleases.append(version)
                # Either this is not a prerelease, or we should have been
                # accepting prereleases from the beginning.
                else:
                    yielded = True
                    yield version

        # Now that we've iterated over everything, determine if we've yielded
        # any values, and if we have not and we have any prereleases stored up
        # then we will go ahead and yield the prereleases.
        if not yielded:
            yield from found_prereleases
625
+
626
+
627
+ _prefix_regex = re.compile(r"^([0-9]+)((?:a|b|c|rc)[0-9]+)$")
628
+
629
+
630
+ def _version_split(version: str) -> List[str]:
631
+ """Split version into components.
632
+
633
+ The split components are intended for version comparison. The logic does
634
+ not attempt to retain the original version string, so joining the
635
+ components back with :func:`_version_join` may not produce the original
636
+ version string.
637
+ """
638
+ result: List[str] = []
639
+
640
+ epoch, _, rest = version.rpartition("!")
641
+ result.append(epoch or "0")
642
+
643
+ for item in rest.split("."):
644
+ match = _prefix_regex.search(item)
645
+ if match:
646
+ result.extend(match.groups())
647
+ else:
648
+ result.append(item)
649
+ return result
650
+
651
+
652
+ def _version_join(components: List[str]) -> str:
653
+ """Join split version components into a version string.
654
+
655
+ This function assumes the input came from :func:`_version_split`, where the
656
+ first component must be the epoch (either empty or numeric), and all other
657
+ components numeric.
658
+ """
659
+ epoch, *rest = components
660
+ return f"{epoch}!{'.'.join(rest)}"
661
+
662
+
663
+ def _is_not_suffix(segment: str) -> bool:
664
+ return not any(
665
+ segment.startswith(prefix) for prefix in ("dev", "a", "b", "rc", "post")
666
+ )
667
+
668
+
669
+ def _pad_version(left: List[str], right: List[str]) -> Tuple[List[str], List[str]]:
670
+ left_split, right_split = [], []
671
+
672
+ # Get the release segment of our versions
673
+ left_split.append(list(itertools.takewhile(lambda x: x.isdigit(), left)))
674
+ right_split.append(list(itertools.takewhile(lambda x: x.isdigit(), right)))
675
+
676
+ # Get the rest of our versions
677
+ left_split.append(left[len(left_split[0]) :])
678
+ right_split.append(right[len(right_split[0]) :])
679
+
680
+ # Insert our padding
681
+ left_split.insert(1, ["0"] * max(0, len(right_split[0]) - len(left_split[0])))
682
+ right_split.insert(1, ["0"] * max(0, len(left_split[0]) - len(right_split[0])))
683
+
684
+ return (
685
+ list(itertools.chain.from_iterable(left_split)),
686
+ list(itertools.chain.from_iterable(right_split)),
687
+ )
688
+
689
+
690
+ class SpecifierSet(BaseSpecifier):
691
+ """This class abstracts handling of a set of version specifiers.
692
+
693
+ It can be passed a single specifier (``>=3.0``), a comma-separated list of
694
+ specifiers (``>=3.0,!=3.1``), or no specifier at all.
695
+ """
696
+
697
def __init__(
    self, specifiers: str = "", prereleases: Optional[bool] = None
) -> None:
    """Initialize a SpecifierSet instance.

    :param specifiers:
        A single specifier or a comma-separated list of specifiers. Each
        non-empty, whitespace-stripped piece is turned into a
        :class:`Specifier`.
    :param prereleases:
        Explicit override for whether prerelease versions are accepted.
        ``None`` (the default) leaves the decision to the contained
        specifiers.

    :raises InvalidSpecifier:
        If any of the given ``specifiers`` cannot be parsed.
    """
    # Split on commas, trim surrounding whitespace and drop empty pieces.
    trimmed = [piece.strip() for piece in specifiers.split(",")]
    kept = [piece for piece in trimmed if piece]

    # Parse every piece and keep the results in an immutable set.
    self._specs = frozenset(Specifier(piece) for piece in kept)

    # Remember the explicit prerelease override (or None) for later lookups.
    self._prereleases = prereleases
725
+
726
@property
def prereleases(self) -> Optional[bool]:
    """Whether this set accepts pre-releases.

    Returns the explicit override when one was given, ``None`` when there
    is no override and no specifiers, and otherwise whether any contained
    specifier accepts pre-releases.
    """
    forced = self._prereleases
    if forced is not None:
        return forced

    # Nothing forced and nothing to inspect: the answer is unknown.
    if not self._specs:
        return None

    return any(spec.prereleases for spec in self._specs)

@prereleases.setter
def prereleases(self, value: bool) -> None:
    """Set the explicit pre-release override."""
    self._prereleases = value
746
+
747
+ def __repr__(self) -> str:
748
+ """A representation of the specifier set that shows all internal state.
749
+
750
+ Note that the ordering of the individual specifiers within the set may not
751
+ match the input string.
752
+
753
+ >>> SpecifierSet('>=1.0.0,!=2.0.0')
754
+ <SpecifierSet('!=2.0.0,>=1.0.0')>
755
+ >>> SpecifierSet('>=1.0.0,!=2.0.0', prereleases=False)
756
+ <SpecifierSet('!=2.0.0,>=1.0.0', prereleases=False)>
757
+ >>> SpecifierSet('>=1.0.0,!=2.0.0', prereleases=True)
758
+ <SpecifierSet('!=2.0.0,>=1.0.0', prereleases=True)>
759
+ """
760
+ pre = (
761
+ f", prereleases={self.prereleases!r}"
762
+ if self._prereleases is not None
763
+ else ""
764
+ )
765
+
766
+ return f"<SpecifierSet({str(self)!r}{pre})>"
767
+
768
+ def __str__(self) -> str:
769
+ """A string representation of the specifier set that can be round-tripped.
770
+
771
+ Note that the ordering of the individual specifiers within the set may not
772
+ match the input string.
773
+
774
+ >>> str(SpecifierSet(">=1.0.0,!=1.0.1"))
775
+ '!=1.0.1,>=1.0.0'
776
+ >>> str(SpecifierSet(">=1.0.0,!=1.0.1", prereleases=False))
777
+ '!=1.0.1,>=1.0.0'
778
+ """
779
+ return ",".join(sorted(str(s) for s in self._specs))
780
+
781
+ def __hash__(self) -> int:
782
+ return hash(self._specs)
783
+
784
def __and__(self, other: Union["SpecifierSet", str]) -> "SpecifierSet":
    """Return a SpecifierSet which is a combination of the two sets.

    :param other: The other object to combine with; a string is parsed
        into a SpecifierSet first.
    :raises ValueError: If both sides carry explicit but different
        prerelease overrides.

    >>> SpecifierSet(">=1.0.0,!=1.0.1") & '<=2.0.0,!=2.0.1'
    <SpecifierSet('!=1.0.1,!=2.0.1,<=2.0.0,>=1.0.0')>
    >>> SpecifierSet(">=1.0.0,!=1.0.1") & SpecifierSet('<=2.0.0,!=2.0.1')
    <SpecifierSet('!=1.0.1,!=2.0.1,<=2.0.0,>=1.0.0')>
    """
    if isinstance(other, str):
        other = SpecifierSet(other)
    elif not isinstance(other, SpecifierSet):
        return NotImplemented

    mine, theirs = self._prereleases, other._prereleases

    # Reconcile the two overrides: an unset side defers to the other, equal
    # values pass through, and a True/False clash is an error.
    if mine is None:
        merged = theirs
    elif theirs is None or mine == theirs:
        merged = mine
    else:
        raise ValueError(
            "Cannot combine SpecifierSets with True and False prerelease "
            "overrides."
        )

    combined = SpecifierSet()
    combined._specs = frozenset(self._specs | other._specs)
    combined._prereleases = merged
    return combined
815
+
816
def __eq__(self, other: object) -> bool:
    """Whether or not the two SpecifierSet-like objects are equal.

    :param other: The other object to check against. A string or a
        :class:`Specifier` is converted to a SpecifierSet first.

    The value of :attr:`prereleases` is ignored.

    A string that cannot be parsed as a specifier set makes the comparison
    return ``NotImplemented`` (so ``==`` evaluates to ``False``) instead of
    raising, matching how ``Specifier.__eq__`` treats unparsable strings.

    >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0,!=1.0.1")
    True
    >>> (SpecifierSet(">=1.0.0,!=1.0.1", prereleases=False) ==
    ...  SpecifierSet(">=1.0.0,!=1.0.1", prereleases=True))
    True
    >>> SpecifierSet(">=1.0.0,!=1.0.1") == ">=1.0.0,!=1.0.1"
    True
    >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0")
    False
    >>> SpecifierSet(">=1.0.0,!=1.0.1") == SpecifierSet(">=1.0.0,!=1.0.2")
    False
    >>> SpecifierSet(">=1.0.0") == "lolwat"
    False
    """
    if isinstance(other, (str, Specifier)):
        try:
            other = SpecifierSet(str(other))
        except InvalidSpecifier:
            # An equality test should not raise just because the other
            # operand is not a valid specifier string.
            return NotImplemented
    elif not isinstance(other, SpecifierSet):
        return NotImplemented

    return self._specs == other._specs
841
+
842
def __len__(self) -> int:
    """Return how many specifiers this specifier set holds."""
    specs = self._specs
    return len(specs)
845
+
846
def __iter__(self) -> Iterator[Specifier]:
    """
    Iterate over the :class:`Specifier` instances stored in this set.

    >>> sorted(SpecifierSet(">=1.0.0,!=1.0.1"), key=str)
    [<Specifier('!=1.0.1')>, <Specifier('>=1.0.0')>]
    """
    specs = self._specs
    return iter(specs)
855
+
856
def __contains__(self, item: UnparsedVersion) -> bool:
    """Implement the ``in`` operator for this specifier set.

    :param item: The item to check for.

    Delegates to :meth:`contains` without passing a ``prereleases`` argument.

    >>> "1.2.3" in SpecifierSet(">=1.0.0,!=1.0.1")
    True
    >>> Version("1.2.3") in SpecifierSet(">=1.0.0,!=1.0.1")
    True
    >>> "1.0.1" in SpecifierSet(">=1.0.0,!=1.0.1")
    False
    >>> "1.3.0a1" in SpecifierSet(">=1.0.0,!=1.0.1")
    False
    >>> "1.3.0a1" in SpecifierSet(">=1.0.0,!=1.0.1", prereleases=True)
    True
    """
    is_member = self.contains(item)
    return is_member
876
+
877
def contains(
    self,
    item: UnparsedVersion,
    prereleases: Optional[bool] = None,
    installed: Optional[bool] = None,
) -> bool:
    """Return whether or not the item is contained in this SpecifierSet.

    :param item:
        The item to check for, which can be a version string or a
        :class:`Version` instance.
    :param prereleases:
        Whether or not to match prereleases with this SpecifierSet. If set to
        ``None`` (the default), it uses :attr:`prereleases` to determine
        whether or not prereleases are allowed.
    :param installed:
        When truthy and ``item`` is a prerelease, the item is replaced by its
        base version before being checked against the specifiers.

    >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.2.3")
    True
    >>> SpecifierSet(">=1.0.0,!=1.0.1").contains(Version("1.2.3"))
    True
    >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.0.1")
    False
    >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.3.0a1")
    False
    >>> SpecifierSet(">=1.0.0,!=1.0.1", prereleases=True).contains("1.3.0a1")
    True
    >>> SpecifierSet(">=1.0.0,!=1.0.1").contains("1.3.0a1", prereleases=True)
    True
    """
    # Work on a Version instance from here on.
    version = item if isinstance(item, Version) else Version(item)

    # An explicit argument wins; otherwise fall back to what the set itself
    # says about prereleases.
    allow_prereleases = self.prereleases if prereleases is None else prereleases

    # Short circuit: a prerelease can never match when they are not allowed.
    if version.is_prerelease and not allow_prereleases:
        return False

    if installed and version.is_prerelease:
        version = Version(version.base_version)

    # Every specifier must accept the version. With no specifiers the loop
    # body never runs and the result is True; this is an explicit design
    # decision.
    for spec in self._specs:
        if not spec.contains(version, prereleases=allow_prereleases):
            return False
    return True
933
+
934
def filter(
    self, iterable: Iterable[UnparsedVersionVar], prereleases: Optional[bool] = None
) -> Iterator[UnparsedVersionVar]:
    """Filter items in the given iterable, that match the specifiers in this set.

    :param iterable:
        An iterable that can contain version strings and :class:`Version` instances.
        The items in the iterable will be filtered according to the specifier.
    :param prereleases:
        Whether or not to allow prereleases in the returned iterator. If set to
        ``None`` (the default), the decision is based on the :attr:`prereleases`
        attribute and on whether the only versions matching are prereleases.

    This method is smarter than just ``filter(SpecifierSet(...).contains, [...])``
    because it implements the rule from :pep:`440` that a prerelease item
    SHOULD be accepted if no other versions match the given specifier.

    >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.3", "1.5a1"]))
    ['1.3']
    >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.3", Version("1.4")]))
    ['1.3', <Version('1.4')>]
    >>> list(SpecifierSet(">=1.2.3").filter(["1.2", "1.5a1"]))
    []
    >>> list(SpecifierSet(">=1.2.3").filter(["1.3", "1.5a1"], prereleases=True))
    ['1.3', '1.5a1']
    >>> list(SpecifierSet(">=1.2.3", prereleases=True).filter(["1.3", "1.5a1"]))
    ['1.3', '1.5a1']

    An "empty" SpecifierSet will filter items based on the presence of prerelease
    versions in the set.

    >>> list(SpecifierSet("").filter(["1.3", "1.5a1"]))
    ['1.3']
    >>> list(SpecifierSet("").filter(["1.5a1"]))
    ['1.5a1']
    >>> list(SpecifierSet("", prereleases=True).filter(["1.3", "1.5a1"]))
    ['1.3', '1.5a1']
    >>> list(SpecifierSet("").filter(["1.3", "1.5a1"], prereleases=True))
    ['1.3', '1.5a1']
    """
    # No explicit choice for this call: defer to the set's own setting (which
    # may itself still be None).
    if prereleases is None:
        prereleases = self.prereleases

    # With specifiers present, chain each specifier's filter over the previous
    # result so that they act as a logical AND.
    if self._specs:
        remaining = iterable
        for spec in self._specs:
            remaining = spec.filter(remaining, prereleases=bool(prereleases))
        return iter(remaining)

    # Without specifiers, apply a rough filter that drops prereleases unless
    # there are no final releases.
    accepted: List[UnparsedVersionVar] = []
    held_prereleases: List[UnparsedVersionVar] = []

    for candidate in iterable:
        version = _coerce_version(candidate)
        if not version.is_prerelease or prereleases:
            accepted.append(candidate)
        elif not accepted:
            # Keep prereleases aside only while nothing has been accepted yet.
            held_prereleases.append(candidate)

    # Nothing but prereleases was found and nobody forbade them: use them.
    if not accepted and held_prereleases and prereleases is None:
        return iter(held_prereleases)

    return iter(accepted)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/tags.py ADDED
@@ -0,0 +1,571 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is dual licensed under the terms of the Apache License, Version
2
+ # 2.0, and the BSD License. See the LICENSE file in the root of this repository
3
+ # for complete details.
4
+
5
+ import logging
6
+ import platform
7
+ import re
8
+ import struct
9
+ import subprocess
10
+ import sys
11
+ import sysconfig
12
+ from importlib.machinery import EXTENSION_SUFFIXES
13
+ from typing import (
14
+ Dict,
15
+ FrozenSet,
16
+ Iterable,
17
+ Iterator,
18
+ List,
19
+ Optional,
20
+ Sequence,
21
+ Tuple,
22
+ Union,
23
+ cast,
24
+ )
25
+
26
+ from . import _manylinux, _musllinux
27
+
28
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Type aliases used in the signatures below.
PythonVersion = Sequence[int]
MacVersion = Tuple[int, int]

# Maps an interpreter implementation name to its short tag abbreviation.
INTERPRETER_SHORT_NAMES: Dict[str, str] = dict(
    python="py",  # Generic.
    cpython="cp",
    pypy="pp",
    ironpython="ip",
    jython="jy",
)


# True when a C pointer ("P") occupies four bytes in this interpreter.
_32_BIT_INTERPRETER = struct.calcsize("P") == 4
43
+
44
+
45
class Tag:
    """
    The (interpreter, abi, platform) tag triple of a wheel.

    All three components are stored lower-cased. Instances are treated as
    immutable, so they are hashable and support equality comparison.
    """

    __slots__ = ["_interpreter", "_abi", "_platform", "_hash"]

    def __init__(self, interpreter: str, abi: str, platform: str) -> None:
        self._interpreter = interpreter.lower()
        self._abi = abi.lower()
        self._platform = platform.lower()
        # Tags held in sets are hashed over and over (for instance when sets
        # of tags are tested against each other while scanning many candidate
        # links), so the hash is computed once here and reused afterwards.
        self._hash = hash((self._interpreter, self._abi, self._platform))

    @property
    def interpreter(self) -> str:
        """The lower-cased interpreter component of the tag."""
        return self._interpreter

    @property
    def abi(self) -> str:
        """The lower-cased ABI component of the tag."""
        return self._abi

    @property
    def platform(self) -> str:
        """The lower-cased platform component of the tag."""
        return self._platform

    def __eq__(self, other: object) -> bool:
        if not isinstance(other, Tag):
            return NotImplemented

        # Differing hashes settle the question cheaply.
        if self._hash != other._hash:
            return False
        return (
            self._platform == other._platform
            and self._abi == other._abi
            and self._interpreter == other._interpreter
        )

    def __hash__(self) -> int:
        return self._hash

    def __str__(self) -> str:
        return "-".join((self._interpreter, self._abi, self._platform))

    def __repr__(self) -> str:
        return "<" + str(self) + " @ " + str(id(self)) + ">"
97
+
98
+
99
def parse_tag(tag: str) -> FrozenSet[Tag]:
    """
    Expand a tag string (e.g. `py3-none-any`) into a frozenset of Tag instances.

    The string is split on "-" into exactly three parts, and each part may list
    several "."-separated values (a compressed tag set), hence a set of every
    combination is returned rather than a single Tag.
    """
    interpreters, abis, platforms = tag.split("-")
    return frozenset(
        Tag(interpreter, abi, platform_)
        for interpreter in interpreters.split(".")
        for abi in abis.split(".")
        for platform_ in platforms.split(".")
    )
113
+
114
+
115
def _get_config_var(name: str, warn: bool = False) -> Union[int, str, None]:
    """
    Look up ``name`` via ``sysconfig.get_config_var`` and return the result.

    When the variable is unset (``None``) and ``warn`` is true, a debug message
    is logged before returning.
    """
    value: Union[int, str, None] = sysconfig.get_config_var(name)
    if value is not None or not warn:
        return value
    logger.debug(
        "Config variable '%s' is unset, Python ABI tag may be incorrect", name
    )
    return value
122
+
123
+
124
def _normalize_string(string: str) -> str:
    """Return ``string`` with every ".", "-" and " " replaced by "_"."""
    return string.translate(str.maketrans(".- ", "___"))
126
+
127
+
128
def _is_threaded_cpython(abis: List[str]) -> bool:
    """
    Determine if the ABI corresponds to a threaded (`--disable-gil`) build.

    Only the first ABI is inspected: it is expected to look like e.g. `cp313`
    followed by optional flags, and a "t" among those flags marks a threaded
    build. An empty list or a non-matching first entry gives False.
    """
    if not abis:
        return False
    match = re.match(r"cp\d+(.*)", abis[0])
    if match is None:
        return False
    return "t" in match.group(1)
142
+
143
+
144
def _abi3_applies(python_version: PythonVersion, threading: bool) -> bool:
    """
    Determine if the Python version supports abi3.

    PEP 384 was first implemented in Python 3.2. The threaded (`--disable-gil`)
    builds do not support abi3. A version without a minor component never
    qualifies.
    """
    if len(python_version) <= 1:
        return False
    if tuple(python_version) < (3, 2):
        return False
    return not threading
152
+
153
+
154
def _cpython_abis(py_version: PythonVersion, warn: bool = False) -> List[str]:
    """
    Return the CPython ABI tags for ``py_version``, most specific first.

    The first entry is ``cp<version>`` followed by the flags detected for the
    running build (threading, debug, pymalloc, ucs4). For debug builds on
    Python 3.8+ a second, debug-less entry is added as well.
    """
    py_version = tuple(py_version)  # To allow for version comparison.
    version = _version_nodot(py_version[:2])
    threading = debug = pymalloc = ucs4 = ""
    fallback_abi = None

    with_debug = _get_config_var("Py_DEBUG", warn)
    has_refcount = hasattr(sys, "gettotalrefcount")
    # Windows doesn't set Py_DEBUG, so checking for support of debug-compiled
    # extension modules is the best option.
    # https://github.com/pypa/pip/issues/3383#issuecomment-173267692
    has_ext = "_d.pyd" in EXTENSION_SUFFIXES
    if with_debug or (with_debug is None and (has_refcount or has_ext)):
        debug = "d"

    if py_version >= (3, 13) and _get_config_var("Py_GIL_DISABLED", warn):
        threading = "t"

    if py_version < (3, 8):
        with_pymalloc = _get_config_var("WITH_PYMALLOC", warn)
        if with_pymalloc or with_pymalloc is None:
            pymalloc = "m"
        if py_version < (3, 3):
            unicode_size = _get_config_var("Py_UNICODE_SIZE", warn)
            if unicode_size == 4 or (
                unicode_size is None and sys.maxunicode == 0x10FFFF
            ):
                ucs4 = "u"
    elif debug:
        # Debug builds can also load "normal" extension modules.
        # We can also assume no UCS-4 or pymalloc requirement.
        fallback_abi = f"cp{version}{threading}"

    abis = [f"cp{version}{threading}{debug}{pymalloc}{ucs4}"]
    if fallback_abi is not None:
        abis.append(fallback_abi)
    return abis
185
+
186
+
187
def cpython_tags(
    python_version: Optional[PythonVersion] = None,
    abis: Optional[Iterable[str]] = None,
    platforms: Optional[Iterable[str]] = None,
    *,
    warn: bool = False,
) -> Iterator[Tag]:
    """
    Yields the tags for a CPython interpreter.

    The tags consist of:
    - cp<python_version>-<abi>-<platform>
    - cp<python_version>-abi3-<platform>
    - cp<python_version>-none-<platform>
    - cp<less than python_version>-abi3-<platform>  # Older Python versions down to 3.2.

    If python_version only specifies a major version then user-provided ABIs and
    the 'none' ABItag will be used.

    If 'abi3' or 'none' are specified in 'abis' then they will be yielded at
    their normal position and not at the beginning.

    :param python_version: Version to generate tags for; defaults to the first
        two components of ``sys.version_info``.
    :param abis: ABIs to use; when ``None`` they are detected for the running
        interpreter (only if ``python_version`` has a minor component).
    :param platforms: Platform tags to use; defaults to ``platform_tags()``.
    :param warn: Passed on to the ABI detection, which logs unset config
        variables when true.
    """
    if not python_version:
        python_version = sys.version_info[:2]

    interpreter = f"cp{_version_nodot(python_version[:2])}"

    if abis is None:
        if len(python_version) > 1:
            abis = _cpython_abis(python_version, warn)
        else:
            abis = []
    abis = list(abis)
    # 'abi3' and 'none' are explicitly handled later.
    for explicit_abi in ("abi3", "none"):
        try:
            abis.remove(explicit_abi)
        except ValueError:
            pass

    platforms = list(platforms or platform_tags())
    for abi in abis:
        for platform_ in platforms:
            yield Tag(interpreter, abi, platform_)

    threading = _is_threaded_cpython(abis)
    use_abi3 = _abi3_applies(python_version, threading)
    if use_abi3:
        yield from (Tag(interpreter, "abi3", platform_) for platform_ in platforms)
    yield from (Tag(interpreter, "none", platform_) for platform_ in platforms)

    if use_abi3:
        for minor_version in range(python_version[1] - 1, 1, -1):
            # The interpreter string depends only on the minor version, so it
            # is built once per version rather than once per platform, and
            # kept separate from the current interpreter's name.
            older_interpreter = "cp{version}".format(
                version=_version_nodot((python_version[0], minor_version))
            )
            for platform_ in platforms:
                yield Tag(older_interpreter, "abi3", platform_)
245
+
246
+
247
def _generic_abi() -> List[str]:
    """
    Return the ABI tag based on EXT_SUFFIX.

    :returns: A one-element list with the normalized ABI, the result of
        ``_cpython_abis`` when the suffix carries no ABI part, or an empty list
        when the ABI part of the suffix is empty.
    :raises SystemError: If ``EXT_SUFFIX`` is not a string starting with ".".
    """
    # The following are examples of `EXT_SUFFIX`.
    # We want to keep the parts which are related to the ABI and remove the
    # parts which are related to the platform:
    # - linux:   '.cpython-310-x86_64-linux-gnu.so' => cp310
    # - mac:     '.cpython-310-darwin.so'           => cp310
    # - win:     '.cp310-win_amd64.pyd'             => cp310
    # - win:     '.pyd'                             => cp37 (uses _cpython_abis())
    # - pypy:    '.pypy38-pp73-x86_64-linux-gnu.so' => pypy38_pp73
    # - graalpy: '.graalpy-38-native-x86_64-darwin.dylib'
    #                                               => graalpy_38_native

    ext_suffix = _get_config_var("EXT_SUFFIX", warn=True)
    # startswith() rather than indexing: an empty suffix is then rejected with
    # the same SystemError as any other malformed value instead of escaping as
    # an IndexError.
    if not isinstance(ext_suffix, str) or not ext_suffix.startswith("."):
        raise SystemError("invalid sysconfig.get_config_var('EXT_SUFFIX')")
    parts = ext_suffix.split(".")
    if len(parts) < 3:
        # CPython3.7 and earlier uses ".pyd" on Windows.
        return _cpython_abis(sys.version_info[:2])
    soabi = parts[1]
    if soabi.startswith("cpython"):
        # non-windows
        abi = "cp" + soabi.split("-")[1]
    elif soabi.startswith("cp"):
        # windows
        abi = soabi.split("-")[0]
    elif soabi.startswith("pypy"):
        abi = "-".join(soabi.split("-")[:2])
    elif soabi.startswith("graalpy"):
        abi = "-".join(soabi.split("-")[:3])
    elif soabi:
        # pyston, ironpython, others?
        abi = soabi
    else:
        return []
    return [_normalize_string(abi)]
286
+
287
+
288
def generic_tags(
    interpreter: Optional[str] = None,
    abis: Optional[Iterable[str]] = None,
    platforms: Optional[Iterable[str]] = None,
    *,
    warn: bool = False,
) -> Iterator[Tag]:
    """
    Yields the tags for a generic interpreter.

    The tags consist of:
    - <interpreter>-<abi>-<platform>

    Missing arguments are detected from the running interpreter. The "none"
    ABI will be added (last) if it was not explicitly provided.
    """
    if not interpreter:
        interpreter = "".join([interpreter_name(), interpreter_version(warn=warn)])

    abi_list = _generic_abi() if abis is None else list(abis)
    platform_list = list(platforms or platform_tags())

    if "none" not in abi_list:
        abi_list.append("none")

    for abi in abi_list:
        for platform_ in platform_list:
            yield Tag(interpreter, abi, platform_)
317
+
318
+
319
def _py_interpreter_range(py_version: PythonVersion) -> Iterator[str]:
    """
    Yields `py`-prefixed Python versions in descending order.

    First the full major/minor version (when a minor is given), then the
    major-only version, and finally every earlier minor version of that major
    version down to minor 0.
    """
    has_minor = len(py_version) > 1
    major = py_version[0]

    if has_minor:
        yield f"py{_version_nodot(py_version[:2])}"
    yield f"py{major}"
    if has_minor:
        earlier_minor = py_version[1] - 1
        while earlier_minor >= 0:
            yield f"py{_version_nodot((major, earlier_minor))}"
            earlier_minor -= 1
332
+
333
+
334
def compatible_tags(
    python_version: Optional[PythonVersion] = None,
    interpreter: Optional[str] = None,
    platforms: Optional[Iterable[str]] = None,
) -> Iterator[Tag]:
    """
    Yields the sequence of tags that are compatible with a specific version of Python.

    The tags consist of:
    - py*-none-<platform>
    - <interpreter>-none-any  # ... if `interpreter` is provided.
    - py*-none-any

    ``python_version`` defaults to the first two components of
    ``sys.version_info`` and ``platforms`` to ``platform_tags()``.
    """
    if not python_version:
        python_version = sys.version_info[:2]
    platform_list = list(platforms or platform_tags())

    for py_name in _py_interpreter_range(python_version):
        yield from (Tag(py_name, "none", platform_) for platform_ in platform_list)

    if interpreter:
        yield Tag(interpreter, "none", "any")

    yield from (
        Tag(py_name, "none", "any")
        for py_name in _py_interpreter_range(python_version)
    )
357
+
358
+
359
def _mac_arch(arch: str, is_32bit: bool = _32_BIT_INTERPRETER) -> str:
    """
    Return the architecture name to use for macOS platform tags.

    ``arch`` is returned unchanged unless ``is_32bit`` is true, in which case
    anything starting with "ppc" becomes "ppc" and everything else "i386".
    """
    if is_32bit:
        return "ppc" if arch.startswith("ppc") else "i386"
    return arch
367
+
368
+
369
def _mac_binary_formats(version: MacVersion, cpu_arch: str) -> List[str]:
    """
    Return the binary format names usable for ``cpu_arch`` on macOS ``version``.

    The list starts with ``cpu_arch`` itself followed by the multi-architecture
    formats that include it. An empty list is returned when ``version`` lies
    outside the range accepted for that architecture.
    """
    formats = [cpu_arch]

    if cpu_arch == "x86_64":
        if version < (10, 4):
            return []
        formats += ["intel", "fat64", "fat32"]
    elif cpu_arch == "i386":
        if version < (10, 4):
            return []
        formats += ["intel", "fat32", "fat"]
    elif cpu_arch == "ppc64":
        # TODO: Need to care about 32-bit PPC for ppc64 through 10.2?
        if not (10, 4) <= version <= (10, 5):
            return []
        formats += ["fat64"]
    elif cpu_arch == "ppc":
        if version > (10, 6):
            return []
        formats += ["fat32", "fat"]

    if cpu_arch in ("arm64", "x86_64"):
        formats.append("universal2")

    if cpu_arch in ("x86_64", "i386", "ppc64", "ppc", "intel"):
        formats.append("universal")

    return formats
399
+
400
+
401
def mac_platforms(
    version: Optional[MacVersion] = None, arch: Optional[str] = None
) -> Iterator[str]:
    """
    Yields the platform tags for a macOS system.

    The `version` parameter is a two-item tuple specifying the macOS version to
    generate platform tags for. The `arch` parameter is the CPU architecture to
    generate platform tags for. Both parameters default to the appropriate value
    for the current system.
    """
    tag_template = "macosx_{major}_{minor}_{binary_format}"

    version_str, _, cpu_arch = platform.mac_ver()
    if version is None:
        version = cast("MacVersion", tuple(map(int, version_str.split(".")[:2])))
        if version == (10, 16):
            # When built against an older macOS SDK, Python will report macOS 10.16
            # instead of the real version, so ask a child interpreter that runs
            # with SYSTEM_VERSION_COMPAT=0.
            version_str = subprocess.run(
                [
                    sys.executable,
                    "-sS",
                    "-c",
                    "import platform; print(platform.mac_ver()[0])",
                ],
                check=True,
                env={"SYSTEM_VERSION_COMPAT": "0"},
                stdout=subprocess.PIPE,
                text=True,
            ).stdout
            version = cast("MacVersion", tuple(map(int, version_str.split(".")[:2])))
    if arch is None:
        arch = _mac_arch(cpu_arch)

    if (10, 0) <= version < (11, 0):
        # Prior to Mac OS 11, each yearly release of Mac OS bumped the
        # "minor" version number.  The major version was always 10.
        for minor_version in range(version[1], -1, -1):
            for binary_format in _mac_binary_formats((10, minor_version), arch):
                yield tag_template.format(
                    major=10, minor=minor_version, binary_format=binary_format
                )

    if version >= (11, 0):
        # Starting with Mac OS 11, each yearly release bumps the major version
        # number.   The minor versions are now the midyear updates.
        for major_version in range(version[0], 10, -1):
            for binary_format in _mac_binary_formats((major_version, 0), arch):
                yield tag_template.format(
                    major=major_version, minor=0, binary_format=binary_format
                )

        # Mac OS 11 on x86_64 is compatible with binaries from previous releases.
        # Arm64 support was introduced in 11.0, so no Arm binaries from previous
        # releases exist.
        #
        # However, the "universal2" binary format can have a
        # macOS version earlier than 11.0 when the x86_64 part of the binary supports
        # that version of macOS.
        if arch == "x86_64":
            for minor_version in range(16, 3, -1):
                compat_version = 10, minor_version
                for binary_format in _mac_binary_formats(compat_version, arch):
                    yield tag_template.format(
                        major=compat_version[0],
                        minor=compat_version[1],
                        binary_format=binary_format,
                    )
        else:
            for minor_version in range(16, 3, -1):
                yield tag_template.format(
                    major=10,
                    minor=minor_version,
                    binary_format="universal2",
                )
487
+
488
+
489
def _linux_platforms(is_32bit: bool = _32_BIT_INTERPRETER) -> Iterator[str]:
    """
    Yield the platform tags for a Linux system.

    The manylinux tags come first, then the musllinux tags and finally the
    plain ``linux_<arch>`` tags. When ``is_32bit`` is true, the x86_64 and
    aarch64 platforms are mapped to i686 and armv8l respectively.
    """
    linux = _normalize_string(sysconfig.get_platform())
    if not linux.startswith("linux_"):
        # we should never be here, just yield the sysconfig one and return
        yield linux
        return

    if is_32bit:
        remap_32bit = {
            "linux_x86_64": "linux_i686",
            "linux_aarch64": "linux_armv8l",
        }
        linux = remap_32bit.get(linux, linux)

    base_arch = linux.split("_", 1)[1]
    if base_arch == "armv8l":
        archs = ["armv8l", "armv7l"]
    else:
        archs = [base_arch]

    yield from _manylinux.platform_tags(archs)
    yield from _musllinux.platform_tags(archs)
    yield from (f"linux_{arch}" for arch in archs)
506
+
507
+
508
def _generic_platforms() -> Iterator[str]:
    """Yield the single normalized platform string reported by sysconfig."""
    platform_name = sysconfig.get_platform()
    yield _normalize_string(platform_name)
510
+
511
+
512
def platform_tags() -> Iterator[str]:
    """
    Provides the platform tags for this installation.

    Dispatches on ``platform.system()``: macOS and Linux have dedicated
    generators, everything else uses the generic one.
    """
    system = platform.system()
    if system == "Darwin":
        return mac_platforms()
    if system == "Linux":
        return _linux_platforms()
    return _generic_platforms()
522
+
523
+
524
def interpreter_name() -> str:
    """
    Returns the name of the running interpreter.

    Implementations listed in ``INTERPRETER_SHORT_NAMES`` are returned as their
    reserved abbreviation; any other implementation name is returned as is.
    """
    full_name = sys.implementation.name
    short_name = INTERPRETER_SHORT_NAMES.get(full_name)
    return short_name if short_name else full_name
533
+
534
+
535
def interpreter_version(*, warn: bool = False) -> str:
    """
    Returns the version of the running interpreter.

    The ``py_version_nodot`` config variable is used when it is set; otherwise
    the version is built from the first two components of ``sys.version_info``.
    """
    configured = _get_config_var("py_version_nodot", warn=warn)
    if configured:
        return str(configured)
    return _version_nodot(sys.version_info[:2])
545
+
546
+
547
def _version_nodot(version: PythonVersion) -> str:
    """Concatenate the version components without any separator."""
    return "".join(str(component) for component in version)
549
+
550
+
551
def sys_tags(*, warn: bool = False) -> Iterator[Tag]:
    """
    Returns the sequence of tag triples for the running interpreter.

    The order of the sequence corresponds to priority order for the
    interpreter, from most to least important.

    :param warn: Forwarded to the tag generators, which log a debug message
        for unset config variables when it is true.
    """

    interp_name = interpreter_name()
    if interp_name == "cp":
        yield from cpython_tags(warn=warn)
    else:
        # Forward ``warn`` here as well so that the flag is honoured for
        # non-CPython interpreters, not only on the CPython path.
        yield from generic_tags(warn=warn)

    if interp_name == "pp":
        interp = "pp3"
    elif interp_name == "cp":
        interp = "cp" + interpreter_version(warn=warn)
    else:
        interp = None
    yield from compatible_tags(interpreter=interp)
Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/wheel/vendored/packaging/utils.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file is dual licensed under the terms of the Apache License, Version
2
+ # 2.0, and the BSD License. See the LICENSE file in the root of this repository
3
+ # for complete details.
4
+
5
+ import re
6
+ from typing import FrozenSet, NewType, Tuple, Union, cast
7
+
8
+ from .tags import Tag, parse_tag
9
+ from .version import InvalidVersion, Version
10
+
11
# A wheel build tag: either the empty tuple or an (int, str) pair.
BuildTag = Union[Tuple[()], Tuple[int, str]]
# Distinct static type for names returned by ``canonicalize_name``.
NormalizedName = NewType("NormalizedName", str)
13
+
14
+
15
class InvalidName(ValueError):
    """
    Raised for an invalid distribution name; see the packaging user guide.
    """
19
+
20
+
21
class InvalidWheelFilename(ValueError):
    """
    Raised for an invalid wheel filename; see PEP 427 for the expected format.
    """
25
+
26
+
27
class InvalidSdistFilename(ValueError):
    """
    Raised for an invalid sdist filename; see the packaging user guide.
    """
31
+
32
+
33
# Core metadata spec for `Name`: alphanumeric at both ends, with ".", "_" and
# "-" allowed in between (matched case-insensitively).
_validate_regex = re.compile(
    r"^([A-Z0-9]|[A-Z0-9][A-Z0-9._-]*[A-Z0-9])$",
    flags=re.IGNORECASE,
)
# Runs of "-", "_" and "." that get collapsed when canonicalizing a name.
_canonicalize_regex = re.compile(r"[-_.]+")
# Shape of an already-normalized name.
_normalized_regex = re.compile(r"^([a-z0-9]|[a-z0-9]([a-z0-9-](?!--))*[a-z0-9])$")
# PEP 427: The build number must start with a digit.
_build_tag_regex = re.compile(r"(\d+)(.*)")
41
+
42
+
43
def canonicalize_name(name: str, *, validate: bool = False) -> NormalizedName:
    """
    Return ``name`` lower-cased with runs of "-", "_" and "." replaced by "-".

    When ``validate`` is true, a name that does not match the core metadata
    `Name` pattern raises :exc:`InvalidName` instead.
    """
    if validate:
        if _validate_regex.match(name) is None:
            raise InvalidName(f"name is invalid: {name!r}")
    # This is taken from PEP 503.
    normalized = _canonicalize_regex.sub("-", name).lower()
    return cast(NormalizedName, normalized)
49
+
50
+
51
def is_normalized_name(name: str) -> bool:
    """Return True if ``name`` matches the normalized-name pattern."""
    matched = _normalized_regex.match(name)
    return matched is not None
53
+
54
+
55
def canonicalize_version(
    version: Union[Version, str], *, strip_trailing_zero: bool = True
) -> str:
    """
    Render a version in canonical form.

    This is very similar to Version.__str__, but has one subtle difference
    with the way it handles the release segment: trailing ".0" components are
    removed unless ``strip_trailing_zero`` is false. A string that cannot be
    parsed as a version is returned unchanged.
    """
    if isinstance(version, str):
        try:
            parsed = Version(version)
        except InvalidVersion:
            # Legacy versions cannot be normalized
            return version
    else:
        parsed = version

    # Epoch
    epoch = f"{parsed.epoch}!" if parsed.epoch != 0 else ""

    # Release segment
    release = ".".join(str(x) for x in parsed.release)
    if strip_trailing_zero:
        # NB: This strips trailing '.0's to normalize
        release = re.sub(r"(\.0)+$", "", release)

    # Pre-release
    pre = "".join(str(x) for x in parsed.pre) if parsed.pre is not None else ""

    # Post-release
    post = f".post{parsed.post}" if parsed.post is not None else ""

    # Development release
    dev = f".dev{parsed.dev}" if parsed.dev is not None else ""

    # Local version segment
    local = f"+{parsed.local}" if parsed.local is not None else ""

    return epoch + release + pre + post + dev + local
101
+
102
+
103
def parse_wheel_filename(
    filename: str,
) -> Tuple[NormalizedName, Version, BuildTag, FrozenSet[Tag]]:
    """
    Split a wheel filename into (name, version, build tag, tags).

    The build tag is ``()`` when the filename has no build component.

    :raises InvalidWheelFilename: If the extension is not ".whl", the number of
        "-"-separated parts is wrong, or the project name, version or build
        number is invalid.
    """
    if not filename.endswith(".whl"):
        raise InvalidWheelFilename(
            f"Invalid wheel filename (extension must be '.whl'): {filename}"
        )

    # From here on ``filename`` is the stem without the extension.
    filename = filename[: -len(".whl")]
    dashes = filename.count("-")
    if dashes < 4 or dashes > 5:
        raise InvalidWheelFilename(
            f"Invalid wheel filename (wrong number of parts): {filename}"
        )

    # Split off everything but the trailing three-part tag, which stays whole.
    parts = filename.split("-", dashes - 2)
    name_part = parts[0]
    # See PEP 427 for the rules on escaping the project name.
    name_is_wellformed = re.match(r"^[\w\d._]*$", name_part, re.UNICODE) is not None
    if "__" in name_part or not name_is_wellformed:
        raise InvalidWheelFilename(f"Invalid project name: {filename}")
    name = canonicalize_name(name_part)

    try:
        version = Version(parts[1])
    except InvalidVersion as e:
        raise InvalidWheelFilename(
            f"Invalid wheel filename (invalid version): {filename}"
        ) from e

    build: BuildTag = ()
    if dashes == 5:
        build_part = parts[2]
        build_match = _build_tag_regex.match(build_part)
        if build_match is None:
            raise InvalidWheelFilename(
                f"Invalid build number: {build_part} in '{filename}'"
            )
        build = cast(BuildTag, (int(build_match.group(1)), build_match.group(2)))

    tags = parse_tag(parts[-1])
    return (name, version, build, tags)
144
+
145
+
146
def parse_sdist_filename(filename: str) -> Tuple[NormalizedName, Version]:
    """
    Split an sdist filename into (name, version).

    :raises InvalidSdistFilename: If the extension is neither ".tar.gz" nor
        ".zip", the stem contains no "-", or the version part is invalid.
    """
    for extension in (".tar.gz", ".zip"):
        if filename.endswith(extension):
            file_stem = filename[: -len(extension)]
            break
    else:
        raise InvalidSdistFilename(
            f"Invalid sdist filename (extension must be '.tar.gz' or '.zip'):"
            f" {filename}"
        )

    # We are requiring a PEP 440 version, which cannot contain dashes,
    # so we split on the last dash.
    name_part, sep, version_part = file_stem.rpartition("-")
    if sep == "":
        raise InvalidSdistFilename(f"Invalid sdist filename: {filename}")

    name = canonicalize_name(name_part)

    try:
        version = Version(version_part)
    except InvalidVersion as e:
        raise InvalidSdistFilename(
            f"Invalid sdist filename (invalid version): {filename}"
        ) from e

    return (name, version)