koichi12 commited on
Commit
bdf6bb8
·
verified ·
1 Parent(s): a249ee4

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc +0 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc +0 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc +0 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc +0 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc +0 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc +0 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h +891 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h +693 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h +824 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py +0 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc +0 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py +0 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc +0 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h +758 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h +348 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h +227 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h +350 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h +94 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp +0 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h +1958 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h +224 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h +0 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h +109 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp +224 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h +65 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h +118 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h +145 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h +0 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h +65 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h +103 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h +65 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp +134 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp +527 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h +439 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h +739 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h +78 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h +658 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py +0 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h +0 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc +0 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h +448 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py +0 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py +70 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py +161 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py +74 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py +499 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc ADDED
Binary file (13.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc ADDED
Binary file (11.3 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc ADDED
Binary file (46.4 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc ADDED
Binary file (25.4 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc ADDED
Binary file (67.5 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc ADDED
Binary file (6.66 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h ADDED
@@ -0,0 +1,891 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * This is the public header file for the CUBLAS library, defining the API
52
+ *
53
+ * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
54
+ * on top of the CUDA runtime.
55
+ */
56
+
57
+ #if !defined(CUBLAS_H_)
58
+ #define CUBLAS_H_
59
+
60
+ #if defined(CUBLAS_V2_H_)
61
+ #error "It is an error to include both cublas.h and cublas_v2.h"
62
+ #endif
63
+
64
+ #include <cuda_runtime.h>
65
+
66
+ #ifndef CUBLASWINAPI
67
+ #ifdef _WIN32
68
+ #define CUBLASWINAPI __stdcall
69
+ #else
70
+ #define CUBLASWINAPI
71
+ #endif
72
+ #endif
73
+
74
+ #undef CUBLASAPI
75
+ #ifdef __CUDACC__
76
+ #define CUBLASAPI __host__
77
+ #else
78
+ #define CUBLASAPI
79
+ #endif
80
+
81
+ #include "cublas_api.h"
82
+
83
+ #if defined(__cplusplus)
84
+ extern "C" {
85
+ #endif
86
+
87
+ /* CUBLAS data types */
88
+ #define cublasStatus cublasStatus_t
89
+
90
+ cublasStatus CUBLASWINAPI cublasInit(void);
91
+ cublasStatus CUBLASWINAPI cublasShutdown(void);
92
+ cublasStatus CUBLASWINAPI cublasGetError(void);
93
+
94
+ cublasStatus CUBLASWINAPI cublasGetVersion(int* version);
95
+ cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr);
96
+
97
+ cublasStatus CUBLASWINAPI cublasFree(void* devicePtr);
98
+
99
+ cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream);
100
+
101
+ /* ---------------- CUBLAS BLAS1 functions ---------------- */
102
+ /* NRM2 */
103
+ float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx);
104
+ double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx);
105
+ float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx);
106
+ double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx);
107
+ /*------------------------------------------------------------------------*/
108
+ /* DOT */
109
+ float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, int incy);
110
+ double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy);
111
+ cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
112
+ cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
113
+ cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
114
+ cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
115
+ /*------------------------------------------------------------------------*/
116
+ /* SCAL */
117
+ void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx);
118
+ void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx);
119
+ void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx);
120
+ void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx);
121
+
122
+ void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx);
123
+ void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx);
124
+ /*------------------------------------------------------------------------*/
125
+ /* AXPY */
126
+ void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy);
127
+ void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy);
128
+ void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
129
+ void CUBLASWINAPI
130
+ cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
131
+ /*------------------------------------------------------------------------*/
132
+ /* COPY */
133
+ void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy);
134
+ void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy);
135
+ void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, int incy);
136
+ void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
137
+ /*------------------------------------------------------------------------*/
138
+ /* SWAP */
139
+ void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy);
140
+ void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy);
141
+ void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy);
142
+ void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
143
+ /*------------------------------------------------------------------------*/
144
+ /* AMAX */
145
+ int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx);
146
+ int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx);
147
+ int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx);
148
+ int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx);
149
+ /*------------------------------------------------------------------------*/
150
+ /* AMIN */
151
+ int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx);
152
+ int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx);
153
+
154
+ int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx);
155
+ int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx);
156
+ /*------------------------------------------------------------------------*/
157
+ /* ASUM */
158
+ float CUBLASWINAPI cublasSasum(int n, const float* x, int incx);
159
+ double CUBLASWINAPI cublasDasum(int n, const double* x, int incx);
160
+ float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx);
161
+ double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx);
162
+ /*------------------------------------------------------------------------*/
163
+ /* ROT */
164
+ void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss);
165
+ void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss);
166
+ void CUBLASWINAPI cublasCrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, cuComplex s);
167
+ void CUBLASWINAPI
168
+ cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs);
169
+ void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s);
170
+ void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s);
171
+ /*------------------------------------------------------------------------*/
172
+ /* ROTG */
173
+ void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss);
174
+ void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss);
175
+ void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs);
176
+ void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs);
177
+ /*------------------------------------------------------------------------*/
178
+ /* ROTM */
179
+ void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam);
180
+ void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam);
181
+ /*------------------------------------------------------------------------*/
182
+ /* ROTMG */
183
+ void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam);
184
+ void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam);
185
+
186
+ /* --------------- CUBLAS BLAS2 functions ---------------- */
187
+ /* GEMV */
188
+ void CUBLASWINAPI cublasSgemv(char trans,
189
+ int m,
190
+ int n,
191
+ float alpha,
192
+ const float* A,
193
+ int lda,
194
+ const float* x,
195
+ int incx,
196
+ float beta,
197
+ float* y,
198
+ int incy);
199
+ void CUBLASWINAPI cublasDgemv(char trans,
200
+ int m,
201
+ int n,
202
+ double alpha,
203
+ const double* A,
204
+ int lda,
205
+ const double* x,
206
+ int incx,
207
+ double beta,
208
+ double* y,
209
+ int incy);
210
+ void CUBLASWINAPI cublasCgemv(char trans,
211
+ int m,
212
+ int n,
213
+ cuComplex alpha,
214
+ const cuComplex* A,
215
+ int lda,
216
+ const cuComplex* x,
217
+ int incx,
218
+ cuComplex beta,
219
+ cuComplex* y,
220
+ int incy);
221
+ void CUBLASWINAPI cublasZgemv(char trans,
222
+ int m,
223
+ int n,
224
+ cuDoubleComplex alpha,
225
+ const cuDoubleComplex* A,
226
+ int lda,
227
+ const cuDoubleComplex* x,
228
+ int incx,
229
+ cuDoubleComplex beta,
230
+ cuDoubleComplex* y,
231
+ int incy);
232
+ /*------------------------------------------------------------------------*/
233
+ /* GBMV */
234
+ void CUBLASWINAPI cublasSgbmv(char trans,
235
+ int m,
236
+ int n,
237
+ int kl,
238
+ int ku,
239
+ float alpha,
240
+ const float* A,
241
+ int lda,
242
+ const float* x,
243
+ int incx,
244
+ float beta,
245
+ float* y,
246
+ int incy);
247
+ void CUBLASWINAPI cublasDgbmv(char trans,
248
+ int m,
249
+ int n,
250
+ int kl,
251
+ int ku,
252
+ double alpha,
253
+ const double* A,
254
+ int lda,
255
+ const double* x,
256
+ int incx,
257
+ double beta,
258
+ double* y,
259
+ int incy);
260
+ void CUBLASWINAPI cublasCgbmv(char trans,
261
+ int m,
262
+ int n,
263
+ int kl,
264
+ int ku,
265
+ cuComplex alpha,
266
+ const cuComplex* A,
267
+ int lda,
268
+ const cuComplex* x,
269
+ int incx,
270
+ cuComplex beta,
271
+ cuComplex* y,
272
+ int incy);
273
+ void CUBLASWINAPI cublasZgbmv(char trans,
274
+ int m,
275
+ int n,
276
+ int kl,
277
+ int ku,
278
+ cuDoubleComplex alpha,
279
+ const cuDoubleComplex* A,
280
+ int lda,
281
+ const cuDoubleComplex* x,
282
+ int incx,
283
+ cuDoubleComplex beta,
284
+ cuDoubleComplex* y,
285
+ int incy);
286
+ /*------------------------------------------------------------------------*/
287
+ /* TRMV */
288
+ void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
289
+ void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
290
+ void CUBLASWINAPI
291
+ cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
292
+ void CUBLASWINAPI
293
+ cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
294
+ /*------------------------------------------------------------------------*/
295
+ /* TBMV */
296
+ void CUBLASWINAPI
297
+ cublasStbmv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
298
+ void CUBLASWINAPI
299
+ cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
300
+ void CUBLASWINAPI
301
+ cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
302
+ void CUBLASWINAPI cublasZtbmv(
303
+ char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
304
+ /*------------------------------------------------------------------------*/
305
+ /* TPMV */
306
+ void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
307
+
308
+ void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
309
+
310
+ void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
311
+
312
+ void CUBLASWINAPI
313
+ cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
314
+ /*------------------------------------------------------------------------*/
315
+ /* TRSV */
316
+ void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
317
+
318
+ void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
319
+
320
+ void CUBLASWINAPI
321
+ cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
322
+
323
+ void CUBLASWINAPI
324
+ cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
325
+ /*------------------------------------------------------------------------*/
326
+ /* TPSV */
327
+ void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
328
+
329
+ void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
330
+
331
+ void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
332
+
333
+ void CUBLASWINAPI
334
+ cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
335
+ /*------------------------------------------------------------------------*/
336
+ /* TBSV */
337
+ void CUBLASWINAPI
338
+ cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
339
+
340
+ void CUBLASWINAPI
341
+ cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
342
+ void CUBLASWINAPI
343
+ cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
344
+
345
+ void CUBLASWINAPI cublasZtbsv(
346
+ char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
347
+ /*------------------------------------------------------------------------*/
348
+ /* SYMV/HEMV */
349
+ void CUBLASWINAPI cublasSsymv(
350
+ char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy);
351
+ void CUBLASWINAPI cublasDsymv(char uplo,
352
+ int n,
353
+ double alpha,
354
+ const double* A,
355
+ int lda,
356
+ const double* x,
357
+ int incx,
358
+ double beta,
359
+ double* y,
360
+ int incy);
361
+ void CUBLASWINAPI cublasChemv(char uplo,
362
+ int n,
363
+ cuComplex alpha,
364
+ const cuComplex* A,
365
+ int lda,
366
+ const cuComplex* x,
367
+ int incx,
368
+ cuComplex beta,
369
+ cuComplex* y,
370
+ int incy);
371
+ void CUBLASWINAPI cublasZhemv(char uplo,
372
+ int n,
373
+ cuDoubleComplex alpha,
374
+ const cuDoubleComplex* A,
375
+ int lda,
376
+ const cuDoubleComplex* x,
377
+ int incx,
378
+ cuDoubleComplex beta,
379
+ cuDoubleComplex* y,
380
+ int incy);
381
+ /*------------------------------------------------------------------------*/
382
+ /* SBMV/HBMV */
383
+ void CUBLASWINAPI cublasSsbmv(char uplo,
384
+ int n,
385
+ int k,
386
+ float alpha,
387
+ const float* A,
388
+ int lda,
389
+ const float* x,
390
+ int incx,
391
+ float beta,
392
+ float* y,
393
+ int incy);
394
+ void CUBLASWINAPI cublasDsbmv(char uplo,
395
+ int n,
396
+ int k,
397
+ double alpha,
398
+ const double* A,
399
+ int lda,
400
+ const double* x,
401
+ int incx,
402
+ double beta,
403
+ double* y,
404
+ int incy);
405
+ void CUBLASWINAPI cublasChbmv(char uplo,
406
+ int n,
407
+ int k,
408
+ cuComplex alpha,
409
+ const cuComplex* A,
410
+ int lda,
411
+ const cuComplex* x,
412
+ int incx,
413
+ cuComplex beta,
414
+ cuComplex* y,
415
+ int incy);
416
+ void CUBLASWINAPI cublasZhbmv(char uplo,
417
+ int n,
418
+ int k,
419
+ cuDoubleComplex alpha,
420
+ const cuDoubleComplex* A,
421
+ int lda,
422
+ const cuDoubleComplex* x,
423
+ int incx,
424
+ cuDoubleComplex beta,
425
+ cuDoubleComplex* y,
426
+ int incy);
427
+ /*------------------------------------------------------------------------*/
428
+ /* SPMV/HPMV */
429
+ void CUBLASWINAPI
430
+ cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy);
431
+ void CUBLASWINAPI cublasDspmv(
432
+ char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy);
433
+ void CUBLASWINAPI cublasChpmv(char uplo,
434
+ int n,
435
+ cuComplex alpha,
436
+ const cuComplex* AP,
437
+ const cuComplex* x,
438
+ int incx,
439
+ cuComplex beta,
440
+ cuComplex* y,
441
+ int incy);
442
+ void CUBLASWINAPI cublasZhpmv(char uplo,
443
+ int n,
444
+ cuDoubleComplex alpha,
445
+ const cuDoubleComplex* AP,
446
+ const cuDoubleComplex* x,
447
+ int incx,
448
+ cuDoubleComplex beta,
449
+ cuDoubleComplex* y,
450
+ int incy);
451
+
452
+ /*------------------------------------------------------------------------*/
453
+ /* GER */
454
+ void CUBLASWINAPI
455
+ cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
456
+ void CUBLASWINAPI
457
+ cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
458
+
459
+ void CUBLASWINAPI cublasCgeru(
460
+ int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
461
+ void CUBLASWINAPI cublasCgerc(
462
+ int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
463
+ void CUBLASWINAPI cublasZgeru(int m,
464
+ int n,
465
+ cuDoubleComplex alpha,
466
+ const cuDoubleComplex* x,
467
+ int incx,
468
+ const cuDoubleComplex* y,
469
+ int incy,
470
+ cuDoubleComplex* A,
471
+ int lda);
472
+ void CUBLASWINAPI cublasZgerc(int m,
473
+ int n,
474
+ cuDoubleComplex alpha,
475
+ const cuDoubleComplex* x,
476
+ int incx,
477
+ const cuDoubleComplex* y,
478
+ int incy,
479
+ cuDoubleComplex* A,
480
+ int lda);
481
+ /*------------------------------------------------------------------------*/
482
+ /* SYR/HER */
483
+ void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda);
484
+ void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda);
485
+
486
+ void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda);
487
+ void CUBLASWINAPI
488
+ cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda);
489
+
490
+ /*------------------------------------------------------------------------*/
491
+ /* SPR/HPR */
492
+ void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP);
493
+ void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP);
494
+ void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP);
495
+ void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP);
496
+ /*------------------------------------------------------------------------*/
497
+ /* SYR2/HER2 */
498
+ void CUBLASWINAPI
499
+ cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
500
+ void CUBLASWINAPI
501
+ cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
502
+ void CUBLASWINAPI cublasCher2(char uplo,
503
+ int n,
504
+ cuComplex alpha,
505
+ const cuComplex* x,
506
+ int incx,
507
+ const cuComplex* y,
508
+ int incy,
509
+ cuComplex* A,
510
+ int lda);
511
+ void CUBLASWINAPI cublasZher2(char uplo,
512
+ int n,
513
+ cuDoubleComplex alpha,
514
+ const cuDoubleComplex* x,
515
+ int incx,
516
+ const cuDoubleComplex* y,
517
+ int incy,
518
+ cuDoubleComplex* A,
519
+ int lda);
520
+
521
+ /*------------------------------------------------------------------------*/
522
+ /* SPR2/HPR2 */
523
+ void CUBLASWINAPI
524
+ cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP);
525
+ void CUBLASWINAPI
526
+ cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP);
527
+ void CUBLASWINAPI cublasChpr2(
528
+ char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP);
529
+ void CUBLASWINAPI cublasZhpr2(char uplo,
530
+ int n,
531
+ cuDoubleComplex alpha,
532
+ const cuDoubleComplex* x,
533
+ int incx,
534
+ const cuDoubleComplex* y,
535
+ int incy,
536
+ cuDoubleComplex* AP);
537
+ /* ------------------------BLAS3 Functions ------------------------------- */
538
+ /* GEMM */
539
+ void CUBLASWINAPI cublasSgemm(char transa,
540
+ char transb,
541
+ int m,
542
+ int n,
543
+ int k,
544
+ float alpha,
545
+ const float* A,
546
+ int lda,
547
+ const float* B,
548
+ int ldb,
549
+ float beta,
550
+ float* C,
551
+ int ldc);
552
+ void CUBLASWINAPI cublasDgemm(char transa,
553
+ char transb,
554
+ int m,
555
+ int n,
556
+ int k,
557
+ double alpha,
558
+ const double* A,
559
+ int lda,
560
+ const double* B,
561
+ int ldb,
562
+ double beta,
563
+ double* C,
564
+ int ldc);
565
+ void CUBLASWINAPI cublasCgemm(char transa,
566
+ char transb,
567
+ int m,
568
+ int n,
569
+ int k,
570
+ cuComplex alpha,
571
+ const cuComplex* A,
572
+ int lda,
573
+ const cuComplex* B,
574
+ int ldb,
575
+ cuComplex beta,
576
+ cuComplex* C,
577
+ int ldc);
578
+ void CUBLASWINAPI cublasZgemm(char transa,
579
+ char transb,
580
+ int m,
581
+ int n,
582
+ int k,
583
+ cuDoubleComplex alpha,
584
+ const cuDoubleComplex* A,
585
+ int lda,
586
+ const cuDoubleComplex* B,
587
+ int ldb,
588
+ cuDoubleComplex beta,
589
+ cuDoubleComplex* C,
590
+ int ldc);
591
+ /* -------------------------------------------------------*/
592
+ /* SYRK */
593
+ void CUBLASWINAPI
594
+ cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc);
595
+ void CUBLASWINAPI cublasDsyrk(
596
+ char uplo, char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc);
597
+
598
+ void CUBLASWINAPI cublasCsyrk(char uplo,
599
+ char trans,
600
+ int n,
601
+ int k,
602
+ cuComplex alpha,
603
+ const cuComplex* A,
604
+ int lda,
605
+ cuComplex beta,
606
+ cuComplex* C,
607
+ int ldc);
608
+ void CUBLASWINAPI cublasZsyrk(char uplo,
609
+ char trans,
610
+ int n,
611
+ int k,
612
+ cuDoubleComplex alpha,
613
+ const cuDoubleComplex* A,
614
+ int lda,
615
+ cuDoubleComplex beta,
616
+ cuDoubleComplex* C,
617
+ int ldc);
618
+ /* ------------------------------------------------------- */
619
+ /* HERK */
620
+ void CUBLASWINAPI cublasCherk(
621
+ char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc);
622
+ void CUBLASWINAPI cublasZherk(char uplo,
623
+ char trans,
624
+ int n,
625
+ int k,
626
+ double alpha,
627
+ const cuDoubleComplex* A,
628
+ int lda,
629
+ double beta,
630
+ cuDoubleComplex* C,
631
+ int ldc);
632
+ /* ------------------------------------------------------- */
633
+ /* SYR2K */
634
+ void CUBLASWINAPI cublasSsyr2k(char uplo,
635
+ char trans,
636
+ int n,
637
+ int k,
638
+ float alpha,
639
+ const float* A,
640
+ int lda,
641
+ const float* B,
642
+ int ldb,
643
+ float beta,
644
+ float* C,
645
+ int ldc);
646
+
647
+ void CUBLASWINAPI cublasDsyr2k(char uplo,
648
+ char trans,
649
+ int n,
650
+ int k,
651
+ double alpha,
652
+ const double* A,
653
+ int lda,
654
+ const double* B,
655
+ int ldb,
656
+ double beta,
657
+ double* C,
658
+ int ldc);
659
+ void CUBLASWINAPI cublasCsyr2k(char uplo,
660
+ char trans,
661
+ int n,
662
+ int k,
663
+ cuComplex alpha,
664
+ const cuComplex* A,
665
+ int lda,
666
+ const cuComplex* B,
667
+ int ldb,
668
+ cuComplex beta,
669
+ cuComplex* C,
670
+ int ldc);
671
+
672
+ void CUBLASWINAPI cublasZsyr2k(char uplo,
673
+ char trans,
674
+ int n,
675
+ int k,
676
+ cuDoubleComplex alpha,
677
+ const cuDoubleComplex* A,
678
+ int lda,
679
+ const cuDoubleComplex* B,
680
+ int ldb,
681
+ cuDoubleComplex beta,
682
+ cuDoubleComplex* C,
683
+ int ldc);
684
+ /* ------------------------------------------------------- */
685
+ /* HER2K */
686
+ void CUBLASWINAPI cublasCher2k(char uplo,
687
+ char trans,
688
+ int n,
689
+ int k,
690
+ cuComplex alpha,
691
+ const cuComplex* A,
692
+ int lda,
693
+ const cuComplex* B,
694
+ int ldb,
695
+ float beta,
696
+ cuComplex* C,
697
+ int ldc);
698
+
699
+ void CUBLASWINAPI cublasZher2k(char uplo,
700
+ char trans,
701
+ int n,
702
+ int k,
703
+ cuDoubleComplex alpha,
704
+ const cuDoubleComplex* A,
705
+ int lda,
706
+ const cuDoubleComplex* B,
707
+ int ldb,
708
+ double beta,
709
+ cuDoubleComplex* C,
710
+ int ldc);
711
+
712
+ /*------------------------------------------------------------------------*/
713
+ /* SYMM*/
714
+ void CUBLASWINAPI cublasSsymm(char side,
715
+ char uplo,
716
+ int m,
717
+ int n,
718
+ float alpha,
719
+ const float* A,
720
+ int lda,
721
+ const float* B,
722
+ int ldb,
723
+ float beta,
724
+ float* C,
725
+ int ldc);
726
+ void CUBLASWINAPI cublasDsymm(char side,
727
+ char uplo,
728
+ int m,
729
+ int n,
730
+ double alpha,
731
+ const double* A,
732
+ int lda,
733
+ const double* B,
734
+ int ldb,
735
+ double beta,
736
+ double* C,
737
+ int ldc);
738
+
739
+ void CUBLASWINAPI cublasCsymm(char side,
740
+ char uplo,
741
+ int m,
742
+ int n,
743
+ cuComplex alpha,
744
+ const cuComplex* A,
745
+ int lda,
746
+ const cuComplex* B,
747
+ int ldb,
748
+ cuComplex beta,
749
+ cuComplex* C,
750
+ int ldc);
751
+
752
+ void CUBLASWINAPI cublasZsymm(char side,
753
+ char uplo,
754
+ int m,
755
+ int n,
756
+ cuDoubleComplex alpha,
757
+ const cuDoubleComplex* A,
758
+ int lda,
759
+ const cuDoubleComplex* B,
760
+ int ldb,
761
+ cuDoubleComplex beta,
762
+ cuDoubleComplex* C,
763
+ int ldc);
764
+ /*------------------------------------------------------------------------*/
765
+ /* HEMM*/
766
+ void CUBLASWINAPI cublasChemm(char side,
767
+ char uplo,
768
+ int m,
769
+ int n,
770
+ cuComplex alpha,
771
+ const cuComplex* A,
772
+ int lda,
773
+ const cuComplex* B,
774
+ int ldb,
775
+ cuComplex beta,
776
+ cuComplex* C,
777
+ int ldc);
778
+ void CUBLASWINAPI cublasZhemm(char side,
779
+ char uplo,
780
+ int m,
781
+ int n,
782
+ cuDoubleComplex alpha,
783
+ const cuDoubleComplex* A,
784
+ int lda,
785
+ const cuDoubleComplex* B,
786
+ int ldb,
787
+ cuDoubleComplex beta,
788
+ cuDoubleComplex* C,
789
+ int ldc);
790
+
791
+ /*------------------------------------------------------------------------*/
792
+ /* TRSM*/
793
+ void CUBLASWINAPI cublasStrsm(char side,
794
+ char uplo,
795
+ char transa,
796
+ char diag,
797
+ int m,
798
+ int n,
799
+ float alpha,
800
+ const float* A,
801
+ int lda,
802
+ float* B,
803
+ int ldb);
804
+
805
+ void CUBLASWINAPI cublasDtrsm(char side,
806
+ char uplo,
807
+ char transa,
808
+ char diag,
809
+ int m,
810
+ int n,
811
+ double alpha,
812
+ const double* A,
813
+ int lda,
814
+ double* B,
815
+ int ldb);
816
+
817
+ void CUBLASWINAPI cublasCtrsm(char side,
818
+ char uplo,
819
+ char transa,
820
+ char diag,
821
+ int m,
822
+ int n,
823
+ cuComplex alpha,
824
+ const cuComplex* A,
825
+ int lda,
826
+ cuComplex* B,
827
+ int ldb);
828
+
829
+ void CUBLASWINAPI cublasZtrsm(char side,
830
+ char uplo,
831
+ char transa,
832
+ char diag,
833
+ int m,
834
+ int n,
835
+ cuDoubleComplex alpha,
836
+ const cuDoubleComplex* A,
837
+ int lda,
838
+ cuDoubleComplex* B,
839
+ int ldb);
840
+ /*------------------------------------------------------------------------*/
841
+ /* TRMM*/
842
+ void CUBLASWINAPI cublasStrmm(char side,
843
+ char uplo,
844
+ char transa,
845
+ char diag,
846
+ int m,
847
+ int n,
848
+ float alpha,
849
+ const float* A,
850
+ int lda,
851
+ float* B,
852
+ int ldb);
853
+ void CUBLASWINAPI cublasDtrmm(char side,
854
+ char uplo,
855
+ char transa,
856
+ char diag,
857
+ int m,
858
+ int n,
859
+ double alpha,
860
+ const double* A,
861
+ int lda,
862
+ double* B,
863
+ int ldb);
864
+ void CUBLASWINAPI cublasCtrmm(char side,
865
+ char uplo,
866
+ char transa,
867
+ char diag,
868
+ int m,
869
+ int n,
870
+ cuComplex alpha,
871
+ const cuComplex* A,
872
+ int lda,
873
+ cuComplex* B,
874
+ int ldb);
875
+ void CUBLASWINAPI cublasZtrmm(char side,
876
+ char uplo,
877
+ char transa,
878
+ char diag,
879
+ int m,
880
+ int n,
881
+ cuDoubleComplex alpha,
882
+ const cuDoubleComplex* A,
883
+ int lda,
884
+ cuDoubleComplex* B,
885
+ int ldb);
886
+
887
+ #if defined(__cplusplus)
888
+ }
889
+ #endif /* __cplusplus */
890
+
891
+ #endif /* !defined(CUBLAS_H_) */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cublasXt : Host API, Out of Core and Multi-GPU BLAS Library
51
+
52
+ */
53
+
54
+ #if !defined(CUBLAS_XT_H_)
55
+ #define CUBLAS_XT_H_
56
+
57
+ #include "driver_types.h"
58
+ #include "cuComplex.h" /* import complex data type */
59
+
60
+ #include "cublas_v2.h"
61
+
62
+ #if defined(__cplusplus)
63
+ extern "C" {
64
+ #endif /* __cplusplus */
65
+
66
+ struct cublasXtContext;
67
+ typedef struct cublasXtContext* cublasXtHandle_t;
68
+
69
+ cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle);
70
+ cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle);
71
+ cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards);
72
+ cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards);
73
+ /* This routine selects the Gpus that the user want to use for CUBLAS-XT */
74
+ cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]);
75
+
76
+ /* This routine allows to change the dimension of the tiles ( blockDim x blockDim ) */
77
+ cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim);
78
+ cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim);
79
+
80
+ typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
81
+ /* This routine allows to CUBLAS-XT to pin the Host memory if it find out that some of the matrix passed
82
+ are not pinned : Pinning/Unpinning the Host memory is still a costly operation
83
+ It is better if the user controls the memory on its own (by pinning/unpinning oly when necessary)
84
+ */
85
+ cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode);
86
+ cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode);
87
+
88
+ /* This routines is to provide a CPU Blas routines, used for too small sizes or hybrid computation */
89
+ typedef enum {
90
+ CUBLASXT_FLOAT = 0,
91
+ CUBLASXT_DOUBLE = 1,
92
+ CUBLASXT_COMPLEX = 2,
93
+ CUBLASXT_DOUBLECOMPLEX = 3,
94
+ } cublasXtOpType_t;
95
+
96
+ typedef enum {
97
+ CUBLASXT_GEMM = 0,
98
+ CUBLASXT_SYRK = 1,
99
+ CUBLASXT_HERK = 2,
100
+ CUBLASXT_SYMM = 3,
101
+ CUBLASXT_HEMM = 4,
102
+ CUBLASXT_TRSM = 5,
103
+ CUBLASXT_SYR2K = 6,
104
+ CUBLASXT_HER2K = 7,
105
+
106
+ CUBLASXT_SPMM = 8,
107
+ CUBLASXT_SYRKX = 9,
108
+ CUBLASXT_HERKX = 10,
109
+ CUBLASXT_TRMM = 11,
110
+ CUBLASXT_ROUTINE_MAX = 12,
111
+ } cublasXtBlasOp_t;
112
+
113
+ /* Currently only 32-bit integer BLAS routines are supported */
114
+ cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle,
115
+ cublasXtBlasOp_t blasOp,
116
+ cublasXtOpType_t type,
117
+ void* blasFunctor);
118
+
119
+ /* Specified the percentage of work that should done by the CPU, default is 0 (no work) */
120
+ cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle,
121
+ cublasXtBlasOp_t blasOp,
122
+ cublasXtOpType_t type,
123
+ float ratio);
124
+
125
+ /* GEMM */
126
+ cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle,
127
+ cublasOperation_t transa,
128
+ cublasOperation_t transb,
129
+ size_t m,
130
+ size_t n,
131
+ size_t k,
132
+ const float* alpha,
133
+ const float* A,
134
+ size_t lda,
135
+ const float* B,
136
+ size_t ldb,
137
+ const float* beta,
138
+ float* C,
139
+ size_t ldc);
140
+
141
+ cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle,
142
+ cublasOperation_t transa,
143
+ cublasOperation_t transb,
144
+ size_t m,
145
+ size_t n,
146
+ size_t k,
147
+ const double* alpha,
148
+ const double* A,
149
+ size_t lda,
150
+ const double* B,
151
+ size_t ldb,
152
+ const double* beta,
153
+ double* C,
154
+ size_t ldc);
155
+
156
+ cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle,
157
+ cublasOperation_t transa,
158
+ cublasOperation_t transb,
159
+ size_t m,
160
+ size_t n,
161
+ size_t k,
162
+ const cuComplex* alpha,
163
+ const cuComplex* A,
164
+ size_t lda,
165
+ const cuComplex* B,
166
+ size_t ldb,
167
+ const cuComplex* beta,
168
+ cuComplex* C,
169
+ size_t ldc);
170
+
171
+ cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle,
172
+ cublasOperation_t transa,
173
+ cublasOperation_t transb,
174
+ size_t m,
175
+ size_t n,
176
+ size_t k,
177
+ const cuDoubleComplex* alpha,
178
+ const cuDoubleComplex* A,
179
+ size_t lda,
180
+ const cuDoubleComplex* B,
181
+ size_t ldb,
182
+ const cuDoubleComplex* beta,
183
+ cuDoubleComplex* C,
184
+ size_t ldc);
185
+ /* ------------------------------------------------------- */
186
+ /* SYRK */
187
+ cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle,
188
+ cublasFillMode_t uplo,
189
+ cublasOperation_t trans,
190
+ size_t n,
191
+ size_t k,
192
+ const float* alpha,
193
+ const float* A,
194
+ size_t lda,
195
+ const float* beta,
196
+ float* C,
197
+ size_t ldc);
198
+
199
+ cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle,
200
+ cublasFillMode_t uplo,
201
+ cublasOperation_t trans,
202
+ size_t n,
203
+ size_t k,
204
+ const double* alpha,
205
+ const double* A,
206
+ size_t lda,
207
+ const double* beta,
208
+ double* C,
209
+ size_t ldc);
210
+
211
+ cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle,
212
+ cublasFillMode_t uplo,
213
+ cublasOperation_t trans,
214
+ size_t n,
215
+ size_t k,
216
+ const cuComplex* alpha,
217
+ const cuComplex* A,
218
+ size_t lda,
219
+ const cuComplex* beta,
220
+ cuComplex* C,
221
+ size_t ldc);
222
+
223
+ cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle,
224
+ cublasFillMode_t uplo,
225
+ cublasOperation_t trans,
226
+ size_t n,
227
+ size_t k,
228
+ const cuDoubleComplex* alpha,
229
+ const cuDoubleComplex* A,
230
+ size_t lda,
231
+ const cuDoubleComplex* beta,
232
+ cuDoubleComplex* C,
233
+ size_t ldc);
234
+ /* -------------------------------------------------------------------- */
235
+ /* HERK */
236
+ cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle,
237
+ cublasFillMode_t uplo,
238
+ cublasOperation_t trans,
239
+ size_t n,
240
+ size_t k,
241
+ const float* alpha,
242
+ const cuComplex* A,
243
+ size_t lda,
244
+ const float* beta,
245
+ cuComplex* C,
246
+ size_t ldc);
247
+
248
+ cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle,
249
+ cublasFillMode_t uplo,
250
+ cublasOperation_t trans,
251
+ size_t n,
252
+ size_t k,
253
+ const double* alpha,
254
+ const cuDoubleComplex* A,
255
+ size_t lda,
256
+ const double* beta,
257
+ cuDoubleComplex* C,
258
+ size_t ldc);
259
+ /* -------------------------------------------------------------------- */
260
+ /* SYR2K */
261
+ cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle,
262
+ cublasFillMode_t uplo,
263
+ cublasOperation_t trans,
264
+ size_t n,
265
+ size_t k,
266
+ const float* alpha,
267
+ const float* A,
268
+ size_t lda,
269
+ const float* B,
270
+ size_t ldb,
271
+ const float* beta,
272
+ float* C,
273
+ size_t ldc);
274
+
275
+ cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle,
276
+ cublasFillMode_t uplo,
277
+ cublasOperation_t trans,
278
+ size_t n,
279
+ size_t k,
280
+ const double* alpha,
281
+ const double* A,
282
+ size_t lda,
283
+ const double* B,
284
+ size_t ldb,
285
+ const double* beta,
286
+ double* C,
287
+ size_t ldc);
288
+
289
+ cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle,
290
+ cublasFillMode_t uplo,
291
+ cublasOperation_t trans,
292
+ size_t n,
293
+ size_t k,
294
+ const cuComplex* alpha,
295
+ const cuComplex* A,
296
+ size_t lda,
297
+ const cuComplex* B,
298
+ size_t ldb,
299
+ const cuComplex* beta,
300
+ cuComplex* C,
301
+ size_t ldc);
302
+
303
+ cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle,
304
+ cublasFillMode_t uplo,
305
+ cublasOperation_t trans,
306
+ size_t n,
307
+ size_t k,
308
+ const cuDoubleComplex* alpha,
309
+ const cuDoubleComplex* A,
310
+ size_t lda,
311
+ const cuDoubleComplex* B,
312
+ size_t ldb,
313
+ const cuDoubleComplex* beta,
314
+ cuDoubleComplex* C,
315
+ size_t ldc);
316
+ /* -------------------------------------------------------------------- */
317
+ /* HERKX : variant extension of HERK */
318
+ cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle,
319
+ cublasFillMode_t uplo,
320
+ cublasOperation_t trans,
321
+ size_t n,
322
+ size_t k,
323
+ const cuComplex* alpha,
324
+ const cuComplex* A,
325
+ size_t lda,
326
+ const cuComplex* B,
327
+ size_t ldb,
328
+ const float* beta,
329
+ cuComplex* C,
330
+ size_t ldc);
331
+
332
+ cublasStatus_t CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle,
333
+ cublasFillMode_t uplo,
334
+ cublasOperation_t trans,
335
+ size_t n,
336
+ size_t k,
337
+ const cuDoubleComplex* alpha,
338
+ const cuDoubleComplex* A,
339
+ size_t lda,
340
+ const cuDoubleComplex* B,
341
+ size_t ldb,
342
+ const double* beta,
343
+ cuDoubleComplex* C,
344
+ size_t ldc);
345
+
346
+ /* -------------------------------------------------------------------- */
347
+ /* TRSM */
348
+ cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle,
349
+ cublasSideMode_t side,
350
+ cublasFillMode_t uplo,
351
+ cublasOperation_t trans,
352
+ cublasDiagType_t diag,
353
+ size_t m,
354
+ size_t n,
355
+ const float* alpha,
356
+ const float* A,
357
+ size_t lda,
358
+ float* B,
359
+ size_t ldb);
360
+
361
+ cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle,
362
+ cublasSideMode_t side,
363
+ cublasFillMode_t uplo,
364
+ cublasOperation_t trans,
365
+ cublasDiagType_t diag,
366
+ size_t m,
367
+ size_t n,
368
+ const double* alpha,
369
+ const double* A,
370
+ size_t lda,
371
+ double* B,
372
+ size_t ldb);
373
+
374
+ cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle,
375
+ cublasSideMode_t side,
376
+ cublasFillMode_t uplo,
377
+ cublasOperation_t trans,
378
+ cublasDiagType_t diag,
379
+ size_t m,
380
+ size_t n,
381
+ const cuComplex* alpha,
382
+ const cuComplex* A,
383
+ size_t lda,
384
+ cuComplex* B,
385
+ size_t ldb);
386
+
387
+ cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle,
388
+ cublasSideMode_t side,
389
+ cublasFillMode_t uplo,
390
+ cublasOperation_t trans,
391
+ cublasDiagType_t diag,
392
+ size_t m,
393
+ size_t n,
394
+ const cuDoubleComplex* alpha,
395
+ const cuDoubleComplex* A,
396
+ size_t lda,
397
+ cuDoubleComplex* B,
398
+ size_t ldb);
399
+ /* -------------------------------------------------------------------- */
400
+ /* SYMM : Symmetric Multiply Matrix*/
401
+ cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle,
402
+ cublasSideMode_t side,
403
+ cublasFillMode_t uplo,
404
+ size_t m,
405
+ size_t n,
406
+ const float* alpha,
407
+ const float* A,
408
+ size_t lda,
409
+ const float* B,
410
+ size_t ldb,
411
+ const float* beta,
412
+ float* C,
413
+ size_t ldc);
414
+
415
+ cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t handle,
416
+ cublasSideMode_t side,
417
+ cublasFillMode_t uplo,
418
+ size_t m,
419
+ size_t n,
420
+ const double* alpha,
421
+ const double* A,
422
+ size_t lda,
423
+ const double* B,
424
+ size_t ldb,
425
+ const double* beta,
426
+ double* C,
427
+ size_t ldc);
428
+
429
+ cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle,
430
+ cublasSideMode_t side,
431
+ cublasFillMode_t uplo,
432
+ size_t m,
433
+ size_t n,
434
+ const cuComplex* alpha,
435
+ const cuComplex* A,
436
+ size_t lda,
437
+ const cuComplex* B,
438
+ size_t ldb,
439
+ const cuComplex* beta,
440
+ cuComplex* C,
441
+ size_t ldc);
442
+
443
+ cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle,
444
+ cublasSideMode_t side,
445
+ cublasFillMode_t uplo,
446
+ size_t m,
447
+ size_t n,
448
+ const cuDoubleComplex* alpha,
449
+ const cuDoubleComplex* A,
450
+ size_t lda,
451
+ const cuDoubleComplex* B,
452
+ size_t ldb,
453
+ const cuDoubleComplex* beta,
454
+ cuDoubleComplex* C,
455
+ size_t ldc);
456
+ /* -------------------------------------------------------------------- */
457
+ /* HEMM : Hermitian Matrix Multiply */
458
+ cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle,
459
+ cublasSideMode_t side,
460
+ cublasFillMode_t uplo,
461
+ size_t m,
462
+ size_t n,
463
+ const cuComplex* alpha,
464
+ const cuComplex* A,
465
+ size_t lda,
466
+ const cuComplex* B,
467
+ size_t ldb,
468
+ const cuComplex* beta,
469
+ cuComplex* C,
470
+ size_t ldc);
471
+
472
+ cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle,
473
+ cublasSideMode_t side,
474
+ cublasFillMode_t uplo,
475
+ size_t m,
476
+ size_t n,
477
+ const cuDoubleComplex* alpha,
478
+ const cuDoubleComplex* A,
479
+ size_t lda,
480
+ const cuDoubleComplex* B,
481
+ size_t ldb,
482
+ const cuDoubleComplex* beta,
483
+ cuDoubleComplex* C,
484
+ size_t ldc);
485
+
486
+ /* -------------------------------------------------------------------- */
487
+ /* SYRKX : variant extension of SYRK */
488
+ cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle,
489
+ cublasFillMode_t uplo,
490
+ cublasOperation_t trans,
491
+ size_t n,
492
+ size_t k,
493
+ const float* alpha,
494
+ const float* A,
495
+ size_t lda,
496
+ const float* B,
497
+ size_t ldb,
498
+ const float* beta,
499
+ float* C,
500
+ size_t ldc);
501
+
502
+ cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle,
503
+ cublasFillMode_t uplo,
504
+ cublasOperation_t trans,
505
+ size_t n,
506
+ size_t k,
507
+ const double* alpha,
508
+ const double* A,
509
+ size_t lda,
510
+ const double* B,
511
+ size_t ldb,
512
+ const double* beta,
513
+ double* C,
514
+ size_t ldc);
515
+
516
+ cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle,
517
+ cublasFillMode_t uplo,
518
+ cublasOperation_t trans,
519
+ size_t n,
520
+ size_t k,
521
+ const cuComplex* alpha,
522
+ const cuComplex* A,
523
+ size_t lda,
524
+ const cuComplex* B,
525
+ size_t ldb,
526
+ const cuComplex* beta,
527
+ cuComplex* C,
528
+ size_t ldc);
529
+
530
+ cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle,
531
+ cublasFillMode_t uplo,
532
+ cublasOperation_t trans,
533
+ size_t n,
534
+ size_t k,
535
+ const cuDoubleComplex* alpha,
536
+ const cuDoubleComplex* A,
537
+ size_t lda,
538
+ const cuDoubleComplex* B,
539
+ size_t ldb,
540
+ const cuDoubleComplex* beta,
541
+ cuDoubleComplex* C,
542
+ size_t ldc);
543
+ /* -------------------------------------------------------------------- */
544
+ /* HER2K : variant extension of HERK */
545
+ cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle,
546
+ cublasFillMode_t uplo,
547
+ cublasOperation_t trans,
548
+ size_t n,
549
+ size_t k,
550
+ const cuComplex* alpha,
551
+ const cuComplex* A,
552
+ size_t lda,
553
+ const cuComplex* B,
554
+ size_t ldb,
555
+ const float* beta,
556
+ cuComplex* C,
557
+ size_t ldc);
558
+
559
+ cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle,
560
+ cublasFillMode_t uplo,
561
+ cublasOperation_t trans,
562
+ size_t n,
563
+ size_t k,
564
+ const cuDoubleComplex* alpha,
565
+ const cuDoubleComplex* A,
566
+ size_t lda,
567
+ const cuDoubleComplex* B,
568
+ size_t ldb,
569
+ const double* beta,
570
+ cuDoubleComplex* C,
571
+ size_t ldc);
572
+
573
+ /* -------------------------------------------------------------------- */
574
+ /* SPMM : Symmetric Packed Multiply Matrix*/
575
+ cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle,
576
+ cublasSideMode_t side,
577
+ cublasFillMode_t uplo,
578
+ size_t m,
579
+ size_t n,
580
+ const float* alpha,
581
+ const float* AP,
582
+ const float* B,
583
+ size_t ldb,
584
+ const float* beta,
585
+ float* C,
586
+ size_t ldc);
587
+
588
+ cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle,
589
+ cublasSideMode_t side,
590
+ cublasFillMode_t uplo,
591
+ size_t m,
592
+ size_t n,
593
+ const double* alpha,
594
+ const double* AP,
595
+ const double* B,
596
+ size_t ldb,
597
+ const double* beta,
598
+ double* C,
599
+ size_t ldc);
600
+
601
+ cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle,
602
+ cublasSideMode_t side,
603
+ cublasFillMode_t uplo,
604
+ size_t m,
605
+ size_t n,
606
+ const cuComplex* alpha,
607
+ const cuComplex* AP,
608
+ const cuComplex* B,
609
+ size_t ldb,
610
+ const cuComplex* beta,
611
+ cuComplex* C,
612
+ size_t ldc);
613
+
614
+ cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle,
615
+ cublasSideMode_t side,
616
+ cublasFillMode_t uplo,
617
+ size_t m,
618
+ size_t n,
619
+ const cuDoubleComplex* alpha,
620
+ const cuDoubleComplex* AP,
621
+ const cuDoubleComplex* B,
622
+ size_t ldb,
623
+ const cuDoubleComplex* beta,
624
+ cuDoubleComplex* C,
625
+ size_t ldc);
626
+
627
+ /* -------------------------------------------------------------------- */
628
+ /* TRMM */
629
+ cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle,
630
+ cublasSideMode_t side,
631
+ cublasFillMode_t uplo,
632
+ cublasOperation_t trans,
633
+ cublasDiagType_t diag,
634
+ size_t m,
635
+ size_t n,
636
+ const float* alpha,
637
+ const float* A,
638
+ size_t lda,
639
+ const float* B,
640
+ size_t ldb,
641
+ float* C,
642
+ size_t ldc);
643
+
644
+ cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle,
645
+ cublasSideMode_t side,
646
+ cublasFillMode_t uplo,
647
+ cublasOperation_t trans,
648
+ cublasDiagType_t diag,
649
+ size_t m,
650
+ size_t n,
651
+ const double* alpha,
652
+ const double* A,
653
+ size_t lda,
654
+ const double* B,
655
+ size_t ldb,
656
+ double* C,
657
+ size_t ldc);
658
+
659
+ cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle,
660
+ cublasSideMode_t side,
661
+ cublasFillMode_t uplo,
662
+ cublasOperation_t trans,
663
+ cublasDiagType_t diag,
664
+ size_t m,
665
+ size_t n,
666
+ const cuComplex* alpha,
667
+ const cuComplex* A,
668
+ size_t lda,
669
+ const cuComplex* B,
670
+ size_t ldb,
671
+ cuComplex* C,
672
+ size_t ldc);
673
+
674
+ cublasStatus_t CUBLASWINAPI cublasXtZtrmm(cublasXtHandle_t handle,
675
+ cublasSideMode_t side,
676
+ cublasFillMode_t uplo,
677
+ cublasOperation_t trans,
678
+ cublasDiagType_t diag,
679
+ size_t m,
680
+ size_t n,
681
+ const cuDoubleComplex* alpha,
682
+ const cuDoubleComplex* A,
683
+ size_t lda,
684
+ const cuDoubleComplex* B,
685
+ size_t ldb,
686
+ cuDoubleComplex* C,
687
+ size_t ldc);
688
+
689
+ #if defined(__cplusplus)
690
+ }
691
+ #endif /* __cplusplus */
692
+
693
+ #endif /* !defined(CUBLAS_XT_H_) */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h ADDED
@@ -0,0 +1,824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(NVBLAS_H_)
51
+ #define NVBLAS_H_
52
+
53
+ #include "driver_types.h"
54
+ #include "cuComplex.h" /* import complex data type */
55
+
56
+ #if defined(__cplusplus)
57
+ extern "C" {
58
+ #endif
59
+
60
+ /* GEMM */
61
+ void sgemm_(const char* transa,
62
+ const char* transb,
63
+ const int* m,
64
+ const int* n,
65
+ const int* k,
66
+ const float* alpha,
67
+ const float* a,
68
+ const int* lda,
69
+ const float* b,
70
+ const int* ldb,
71
+ const float* beta,
72
+ float* c,
73
+ const int* ldc);
74
+
75
+ void dgemm_(const char* transa,
76
+ const char* transb,
77
+ const int* m,
78
+ const int* n,
79
+ const int* k,
80
+ const double* alpha,
81
+ const double* a,
82
+ const int* lda,
83
+ const double* b,
84
+ const int* ldb,
85
+ const double* beta,
86
+ double* c,
87
+ const int* ldc);
88
+
89
+ void cgemm_(const char* transa,
90
+ const char* transb,
91
+ const int* m,
92
+ const int* n,
93
+ const int* k,
94
+ const cuComplex* alpha,
95
+ const cuComplex* a,
96
+ const int* lda,
97
+ const cuComplex* b,
98
+ const int* ldb,
99
+ const cuComplex* beta,
100
+ cuComplex* c,
101
+ const int* ldc);
102
+
103
+ void zgemm_(const char* transa,
104
+ const char* transb,
105
+ const int* m,
106
+ const int* n,
107
+ const int* k,
108
+ const cuDoubleComplex* alpha,
109
+ const cuDoubleComplex* a,
110
+ const int* lda,
111
+ const cuDoubleComplex* b,
112
+ const int* ldb,
113
+ const cuDoubleComplex* beta,
114
+ cuDoubleComplex* c,
115
+ const int* ldc);
116
+
117
+ void sgemm(const char* transa,
118
+ const char* transb,
119
+ const int* m,
120
+ const int* n,
121
+ const int* k,
122
+ const float* alpha,
123
+ const float* a,
124
+ const int* lda,
125
+ const float* b,
126
+ const int* ldb,
127
+ const float* beta,
128
+ float* c,
129
+ const int* ldc);
130
+
131
+ void dgemm(const char* transa,
132
+ const char* transb,
133
+ const int* m,
134
+ const int* n,
135
+ const int* k,
136
+ const double* alpha,
137
+ const double* a,
138
+ const int* lda,
139
+ const double* b,
140
+ const int* ldb,
141
+ const double* beta,
142
+ double* c,
143
+ const int* ldc);
144
+
145
+ void cgemm(const char* transa,
146
+ const char* transb,
147
+ const int* m,
148
+ const int* n,
149
+ const int* k,
150
+ const cuComplex* alpha,
151
+ const cuComplex* a,
152
+ const int* lda,
153
+ const cuComplex* b,
154
+ const int* ldb,
155
+ const cuComplex* beta,
156
+ cuComplex* c,
157
+ const int* ldc);
158
+
159
+ void zgemm(const char* transa,
160
+ const char* transb,
161
+ const int* m,
162
+ const int* n,
163
+ const int* k,
164
+ const cuDoubleComplex* alpha,
165
+ const cuDoubleComplex* a,
166
+ const int* lda,
167
+ const cuDoubleComplex* b,
168
+ const int* ldb,
169
+ const cuDoubleComplex* beta,
170
+ cuDoubleComplex* c,
171
+ const int* ldc);
172
+
173
+ /* SYRK */
174
+ void ssyrk_(const char* uplo,
175
+ const char* trans,
176
+ const int* n,
177
+ const int* k,
178
+ const float* alpha,
179
+ const float* a,
180
+ const int* lda,
181
+ const float* beta,
182
+ float* c,
183
+ const int* ldc);
184
+
185
+ void dsyrk_(const char* uplo,
186
+ const char* trans,
187
+ const int* n,
188
+ const int* k,
189
+ const double* alpha,
190
+ const double* a,
191
+ const int* lda,
192
+ const double* beta,
193
+ double* c,
194
+ const int* ldc);
195
+
196
+ void csyrk_(const char* uplo,
197
+ const char* trans,
198
+ const int* n,
199
+ const int* k,
200
+ const cuComplex* alpha,
201
+ const cuComplex* a,
202
+ const int* lda,
203
+ const cuComplex* beta,
204
+ cuComplex* c,
205
+ const int* ldc);
206
+
207
+ void zsyrk_(const char* uplo,
208
+ const char* trans,
209
+ const int* n,
210
+ const int* k,
211
+ const cuDoubleComplex* alpha,
212
+ const cuDoubleComplex* a,
213
+ const int* lda,
214
+ const cuDoubleComplex* beta,
215
+ cuDoubleComplex* c,
216
+ const int* ldc);
217
+
218
+ void ssyrk(const char* uplo,
219
+ const char* trans,
220
+ const int* n,
221
+ const int* k,
222
+ const float* alpha,
223
+ const float* a,
224
+ const int* lda,
225
+ const float* beta,
226
+ float* c,
227
+ const int* ldc);
228
+
229
+ void dsyrk(const char* uplo,
230
+ const char* trans,
231
+ const int* n,
232
+ const int* k,
233
+ const double* alpha,
234
+ const double* a,
235
+ const int* lda,
236
+ const double* beta,
237
+ double* c,
238
+ const int* ldc);
239
+
240
+ void csyrk(const char* uplo,
241
+ const char* trans,
242
+ const int* n,
243
+ const int* k,
244
+ const cuComplex* alpha,
245
+ const cuComplex* a,
246
+ const int* lda,
247
+ const cuComplex* beta,
248
+ cuComplex* c,
249
+ const int* ldc);
250
+
251
+ void zsyrk(const char* uplo,
252
+ const char* trans,
253
+ const int* n,
254
+ const int* k,
255
+ const cuDoubleComplex* alpha,
256
+ const cuDoubleComplex* a,
257
+ const int* lda,
258
+ const cuDoubleComplex* beta,
259
+ cuDoubleComplex* c,
260
+ const int* ldc);
261
+
262
+ /* HERK */
263
+ void cherk_(const char* uplo,
264
+ const char* trans,
265
+ const int* n,
266
+ const int* k,
267
+ const float* alpha,
268
+ const cuComplex* a,
269
+ const int* lda,
270
+ const float* beta,
271
+ cuComplex* c,
272
+ const int* ldc);
273
+
274
+ void zherk_(const char* uplo,
275
+ const char* trans,
276
+ const int* n,
277
+ const int* k,
278
+ const double* alpha,
279
+ const cuDoubleComplex* a,
280
+ const int* lda,
281
+ const double* beta,
282
+ cuDoubleComplex* c,
283
+ const int* ldc);
284
+
285
+ void cherk(const char* uplo,
286
+ const char* trans,
287
+ const int* n,
288
+ const int* k,
289
+ const float* alpha,
290
+ const cuComplex* a,
291
+ const int* lda,
292
+ const float* beta,
293
+ cuComplex* c,
294
+ const int* ldc);
295
+
296
+ void zherk(const char* uplo,
297
+ const char* trans,
298
+ const int* n,
299
+ const int* k,
300
+ const double* alpha,
301
+ const cuDoubleComplex* a,
302
+ const int* lda,
303
+ const double* beta,
304
+ cuDoubleComplex* c,
305
+ const int* ldc);
306
+
307
+ /* TRSM */
308
+ void strsm_(const char* side,
309
+ const char* uplo,
310
+ const char* transa,
311
+ const char* diag,
312
+ const int* m,
313
+ const int* n,
314
+ const float* alpha,
315
+ const float* a,
316
+ const int* lda,
317
+ float* b,
318
+ const int* ldb);
319
+
320
+ void dtrsm_(const char* side,
321
+ const char* uplo,
322
+ const char* transa,
323
+ const char* diag,
324
+ const int* m,
325
+ const int* n,
326
+ const double* alpha,
327
+ const double* a,
328
+ const int* lda,
329
+ double* b,
330
+ const int* ldb);
331
+
332
+ void ctrsm_(const char* side,
333
+ const char* uplo,
334
+ const char* transa,
335
+ const char* diag,
336
+ const int* m,
337
+ const int* n,
338
+ const cuComplex* alpha,
339
+ const cuComplex* a,
340
+ const int* lda,
341
+ cuComplex* b,
342
+ const int* ldb);
343
+
344
+ void ztrsm_(const char* side,
345
+ const char* uplo,
346
+ const char* transa,
347
+ const char* diag,
348
+ const int* m,
349
+ const int* n,
350
+ const cuDoubleComplex* alpha,
351
+ const cuDoubleComplex* a,
352
+ const int* lda,
353
+ cuDoubleComplex* b,
354
+ const int* ldb);
355
+
356
+ void strsm(const char* side,
357
+ const char* uplo,
358
+ const char* transa,
359
+ const char* diag,
360
+ const int* m,
361
+ const int* n,
362
+ const float* alpha,
363
+ const float* a,
364
+ const int* lda,
365
+ float* b,
366
+ const int* ldb);
367
+
368
+ void dtrsm(const char* side,
369
+ const char* uplo,
370
+ const char* transa,
371
+ const char* diag,
372
+ const int* m,
373
+ const int* n,
374
+ const double* alpha,
375
+ const double* a,
376
+ const int* lda,
377
+ double* b,
378
+ const int* ldb);
379
+
380
+ void ctrsm(const char* side,
381
+ const char* uplo,
382
+ const char* transa,
383
+ const char* diag,
384
+ const int* m,
385
+ const int* n,
386
+ const cuComplex* alpha,
387
+ const cuComplex* a,
388
+ const int* lda,
389
+ cuComplex* b,
390
+ const int* ldb);
391
+
392
+ void ztrsm(const char* side,
393
+ const char* uplo,
394
+ const char* transa,
395
+ const char* diag,
396
+ const int* m,
397
+ const int* n,
398
+ const cuDoubleComplex* alpha,
399
+ const cuDoubleComplex* a,
400
+ const int* lda,
401
+ cuDoubleComplex* b,
402
+ const int* ldb);
403
+
404
+ /* SYMM */
405
+ void ssymm_(const char* side,
406
+ const char* uplo,
407
+ const int* m,
408
+ const int* n,
409
+ const float* alpha,
410
+ const float* a,
411
+ const int* lda,
412
+ const float* b,
413
+ const int* ldb,
414
+ const float* beta,
415
+ float* c,
416
+ const int* ldc);
417
+
418
+ void dsymm_(const char* side,
419
+ const char* uplo,
420
+ const int* m,
421
+ const int* n,
422
+ const double* alpha,
423
+ const double* a,
424
+ const int* lda,
425
+ const double* b,
426
+ const int* ldb,
427
+ const double* beta,
428
+ double* c,
429
+ const int* ldc);
430
+
431
+ void csymm_(const char* side,
432
+ const char* uplo,
433
+ const int* m,
434
+ const int* n,
435
+ const cuComplex* alpha,
436
+ const cuComplex* a,
437
+ const int* lda,
438
+ const cuComplex* b,
439
+ const int* ldb,
440
+ const cuComplex* beta,
441
+ cuComplex* c,
442
+ const int* ldc);
443
+
444
+ void zsymm_(const char* side,
445
+ const char* uplo,
446
+ const int* m,
447
+ const int* n,
448
+ const cuDoubleComplex* alpha,
449
+ const cuDoubleComplex* a,
450
+ const int* lda,
451
+ const cuDoubleComplex* b,
452
+ const int* ldb,
453
+ const cuDoubleComplex* beta,
454
+ cuDoubleComplex* c,
455
+ const int* ldc);
456
+
457
+ void ssymm(const char* side,
458
+ const char* uplo,
459
+ const int* m,
460
+ const int* n,
461
+ const float* alpha,
462
+ const float* a,
463
+ const int* lda,
464
+ const float* b,
465
+ const int* ldb,
466
+ const float* beta,
467
+ float* c,
468
+ const int* ldc);
469
+
470
+ void dsymm(const char* side,
471
+ const char* uplo,
472
+ const int* m,
473
+ const int* n,
474
+ const double* alpha,
475
+ const double* a,
476
+ const int* lda,
477
+ const double* b,
478
+ const int* ldb,
479
+ const double* beta,
480
+ double* c,
481
+ const int* ldc);
482
+
483
+ void csymm(const char* side,
484
+ const char* uplo,
485
+ const int* m,
486
+ const int* n,
487
+ const cuComplex* alpha,
488
+ const cuComplex* a,
489
+ const int* lda,
490
+ const cuComplex* b,
491
+ const int* ldb,
492
+ const cuComplex* beta,
493
+ cuComplex* c,
494
+ const int* ldc);
495
+
496
+ void zsymm(const char* side,
497
+ const char* uplo,
498
+ const int* m,
499
+ const int* n,
500
+ const cuDoubleComplex* alpha,
501
+ const cuDoubleComplex* a,
502
+ const int* lda,
503
+ const cuDoubleComplex* b,
504
+ const int* ldb,
505
+ const cuDoubleComplex* beta,
506
+ cuDoubleComplex* c,
507
+ const int* ldc);
508
+
509
+ /* HEMM */
510
+ void chemm_(const char* side,
511
+ const char* uplo,
512
+ const int* m,
513
+ const int* n,
514
+ const cuComplex* alpha,
515
+ const cuComplex* a,
516
+ const int* lda,
517
+ const cuComplex* b,
518
+ const int* ldb,
519
+ const cuComplex* beta,
520
+ cuComplex* c,
521
+ const int* ldc);
522
+
523
+ void zhemm_(const char* side,
524
+ const char* uplo,
525
+ const int* m,
526
+ const int* n,
527
+ const cuDoubleComplex* alpha,
528
+ const cuDoubleComplex* a,
529
+ const int* lda,
530
+ const cuDoubleComplex* b,
531
+ const int* ldb,
532
+ const cuDoubleComplex* beta,
533
+ cuDoubleComplex* c,
534
+ const int* ldc);
535
+
536
+ /* HEMM with no underscore*/
537
+ void chemm(const char* side,
538
+ const char* uplo,
539
+ const int* m,
540
+ const int* n,
541
+ const cuComplex* alpha,
542
+ const cuComplex* a,
543
+ const int* lda,
544
+ const cuComplex* b,
545
+ const int* ldb,
546
+ const cuComplex* beta,
547
+ cuComplex* c,
548
+ const int* ldc);
549
+
550
+ void zhemm(const char* side,
551
+ const char* uplo,
552
+ const int* m,
553
+ const int* n,
554
+ const cuDoubleComplex* alpha,
555
+ const cuDoubleComplex* a,
556
+ const int* lda,
557
+ const cuDoubleComplex* b,
558
+ const int* ldb,
559
+ const cuDoubleComplex* beta,
560
+ cuDoubleComplex* c,
561
+ const int* ldc);
562
+
563
+ /* SYR2K */
564
+ void ssyr2k_(const char* uplo,
565
+ const char* trans,
566
+ const int* n,
567
+ const int* k,
568
+ const float* alpha,
569
+ const float* a,
570
+ const int* lda,
571
+ const float* b,
572
+ const int* ldb,
573
+ const float* beta,
574
+ float* c,
575
+ const int* ldc);
576
+
577
+ void dsyr2k_(const char* uplo,
578
+ const char* trans,
579
+ const int* n,
580
+ const int* k,
581
+ const double* alpha,
582
+ const double* a,
583
+ const int* lda,
584
+ const double* b,
585
+ const int* ldb,
586
+ const double* beta,
587
+ double* c,
588
+ const int* ldc);
589
+
590
+ void csyr2k_(const char* uplo,
591
+ const char* trans,
592
+ const int* n,
593
+ const int* k,
594
+ const cuComplex* alpha,
595
+ const cuComplex* a,
596
+ const int* lda,
597
+ const cuComplex* b,
598
+ const int* ldb,
599
+ const cuComplex* beta,
600
+ cuComplex* c,
601
+ const int* ldc);
602
+
603
+ void zsyr2k_(const char* uplo,
604
+ const char* trans,
605
+ const int* n,
606
+ const int* k,
607
+ const cuDoubleComplex* alpha,
608
+ const cuDoubleComplex* a,
609
+ const int* lda,
610
+ const cuDoubleComplex* b,
611
+ const int* ldb,
612
+ const cuDoubleComplex* beta,
613
+ cuDoubleComplex* c,
614
+ const int* ldc);
615
+
616
+ /* SYR2K no_underscore*/
617
+ void ssyr2k(const char* uplo,
618
+ const char* trans,
619
+ const int* n,
620
+ const int* k,
621
+ const float* alpha,
622
+ const float* a,
623
+ const int* lda,
624
+ const float* b,
625
+ const int* ldb,
626
+ const float* beta,
627
+ float* c,
628
+ const int* ldc);
629
+
630
+ void dsyr2k(const char* uplo,
631
+ const char* trans,
632
+ const int* n,
633
+ const int* k,
634
+ const double* alpha,
635
+ const double* a,
636
+ const int* lda,
637
+ const double* b,
638
+ const int* ldb,
639
+ const double* beta,
640
+ double* c,
641
+ const int* ldc);
642
+
643
+ void csyr2k(const char* uplo,
644
+ const char* trans,
645
+ const int* n,
646
+ const int* k,
647
+ const cuComplex* alpha,
648
+ const cuComplex* a,
649
+ const int* lda,
650
+ const cuComplex* b,
651
+ const int* ldb,
652
+ const cuComplex* beta,
653
+ cuComplex* c,
654
+ const int* ldc);
655
+
656
+ void zsyr2k(const char* uplo,
657
+ const char* trans,
658
+ const int* n,
659
+ const int* k,
660
+ const cuDoubleComplex* alpha,
661
+ const cuDoubleComplex* a,
662
+ const int* lda,
663
+ const cuDoubleComplex* b,
664
+ const int* ldb,
665
+ const cuDoubleComplex* beta,
666
+ cuDoubleComplex* c,
667
+ const int* ldc);
668
+
669
+ /* HERK */
670
+ void cher2k_(const char* uplo,
671
+ const char* trans,
672
+ const int* n,
673
+ const int* k,
674
+ const cuComplex* alpha,
675
+ const cuComplex* a,
676
+ const int* lda,
677
+ const cuComplex* b,
678
+ const int* ldb,
679
+ const float* beta,
680
+ cuComplex* c,
681
+ const int* ldc);
682
+
683
+ void zher2k_(const char* uplo,
684
+ const char* trans,
685
+ const int* n,
686
+ const int* k,
687
+ const cuDoubleComplex* alpha,
688
+ const cuDoubleComplex* a,
689
+ const int* lda,
690
+ const cuDoubleComplex* b,
691
+ const int* ldb,
692
+ const double* beta,
693
+ cuDoubleComplex* c,
694
+ const int* ldc);
695
+
696
+ /* HER2K with no underscore */
697
+ void cher2k(const char* uplo,
698
+ const char* trans,
699
+ const int* n,
700
+ const int* k,
701
+ const cuComplex* alpha,
702
+ const cuComplex* a,
703
+ const int* lda,
704
+ const cuComplex* b,
705
+ const int* ldb,
706
+ const float* beta,
707
+ cuComplex* c,
708
+ const int* ldc);
709
+
710
+ void zher2k(const char* uplo,
711
+ const char* trans,
712
+ const int* n,
713
+ const int* k,
714
+ const cuDoubleComplex* alpha,
715
+ const cuDoubleComplex* a,
716
+ const int* lda,
717
+ const cuDoubleComplex* b,
718
+ const int* ldb,
719
+ const double* beta,
720
+ cuDoubleComplex* c,
721
+ const int* ldc);
722
+
723
+ /* TRMM */
724
+ void strmm_(const char* side,
725
+ const char* uplo,
726
+ const char* transa,
727
+ const char* diag,
728
+ const int* m,
729
+ const int* n,
730
+ const float* alpha,
731
+ const float* a,
732
+ const int* lda,
733
+ float* b,
734
+ const int* ldb);
735
+
736
+ void dtrmm_(const char* side,
737
+ const char* uplo,
738
+ const char* transa,
739
+ const char* diag,
740
+ const int* m,
741
+ const int* n,
742
+ const double* alpha,
743
+ const double* a,
744
+ const int* lda,
745
+ double* b,
746
+ const int* ldb);
747
+
748
+ void ctrmm_(const char* side,
749
+ const char* uplo,
750
+ const char* transa,
751
+ const char* diag,
752
+ const int* m,
753
+ const int* n,
754
+ const cuComplex* alpha,
755
+ const cuComplex* a,
756
+ const int* lda,
757
+ cuComplex* b,
758
+ const int* ldb);
759
+
760
+ void ztrmm_(const char* side,
761
+ const char* uplo,
762
+ const char* transa,
763
+ const char* diag,
764
+ const int* m,
765
+ const int* n,
766
+ const cuDoubleComplex* alpha,
767
+ const cuDoubleComplex* a,
768
+ const int* lda,
769
+ cuDoubleComplex* b,
770
+ const int* ldb);
771
+
772
+ void strmm(const char* side,
773
+ const char* uplo,
774
+ const char* transa,
775
+ const char* diag,
776
+ const int* m,
777
+ const int* n,
778
+ const float* alpha,
779
+ const float* a,
780
+ const int* lda,
781
+ float* b,
782
+ const int* ldb);
783
+
784
+ void dtrmm(const char* side,
785
+ const char* uplo,
786
+ const char* transa,
787
+ const char* diag,
788
+ const int* m,
789
+ const int* n,
790
+ const double* alpha,
791
+ const double* a,
792
+ const int* lda,
793
+ double* b,
794
+ const int* ldb);
795
+
796
+ void ctrmm(const char* side,
797
+ const char* uplo,
798
+ const char* transa,
799
+ const char* diag,
800
+ const int* m,
801
+ const int* n,
802
+ const cuComplex* alpha,
803
+ const cuComplex* a,
804
+ const int* lda,
805
+ cuComplex* b,
806
+ const int* ldb);
807
+
808
+ void ztrmm(const char* side,
809
+ const char* uplo,
810
+ const char* transa,
811
+ const char* diag,
812
+ const int* m,
813
+ const int* n,
814
+ const cuDoubleComplex* alpha,
815
+ const cuDoubleComplex* a,
816
+ const int* lda,
817
+ cuDoubleComplex* b,
818
+ const int* ldb);
819
+
820
+ #if defined(__cplusplus)
821
+ }
822
+ #endif /* __cplusplus */
823
+
824
+ #endif /* !defined(NVBLAS_H_) */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (218 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (218 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (226 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h ADDED
@@ -0,0 +1,758 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // NVIDIA_COPYRIGHT_BEGIN
3
+ //
4
+ // Copyright (c) 2014-2022, NVIDIA CORPORATION. All rights reserved.
5
+ //
6
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
7
+ // and proprietary rights in and to this software, related documentation
8
+ // and any modifications thereto. Any use, reproduction, disclosure or
9
+ // distribution of this software and related documentation without an express
10
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
11
+ //
12
+ // NVIDIA_COPYRIGHT_END
13
+ //
14
+
15
+ #ifndef __NVRTC_H__
16
+ #define __NVRTC_H__
17
+
18
+ #ifdef __cplusplus
19
+ extern "C" {
20
+ #endif /* __cplusplus */
21
+
22
+ #include <stdlib.h>
23
+
24
+
25
+ /*************************************************************************//**
26
+ *
27
+ * \defgroup error Error Handling
28
+ *
29
+ * NVRTC defines the following enumeration type and function for API call
30
+ * error handling.
31
+ *
32
+ ****************************************************************************/
33
+
34
+
35
+ /**
36
+ * \ingroup error
37
+ * \brief The enumerated type nvrtcResult defines API call result codes.
38
+ * NVRTC API functions return nvrtcResult to indicate the call
39
+ * result.
40
+ */
41
+ typedef enum {
42
+ NVRTC_SUCCESS = 0,
43
+ NVRTC_ERROR_OUT_OF_MEMORY = 1,
44
+ NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
45
+ NVRTC_ERROR_INVALID_INPUT = 3,
46
+ NVRTC_ERROR_INVALID_PROGRAM = 4,
47
+ NVRTC_ERROR_INVALID_OPTION = 5,
48
+ NVRTC_ERROR_COMPILATION = 6,
49
+ NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
50
+ NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
51
+ NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
52
+ NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
53
+ NVRTC_ERROR_INTERNAL_ERROR = 11
54
+ } nvrtcResult;
55
+
56
+
57
+ /**
58
+ * \ingroup error
59
+ * \brief nvrtcGetErrorString is a helper function that returns a string
60
+ * describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
61
+ * \c "NVRTC_SUCCESS".
62
+ * For unrecognized enumeration values, it returns
63
+ * \c "NVRTC_ERROR unknown".
64
+ *
65
+ * \param [in] result CUDA Runtime Compilation API result code.
66
+ * \return Message string for the given #nvrtcResult code.
67
+ */
68
+ const char *nvrtcGetErrorString(nvrtcResult result);
69
+
70
+
71
+ /*************************************************************************//**
72
+ *
73
+ * \defgroup query General Information Query
74
+ *
75
+ * NVRTC defines the following function for general information query.
76
+ *
77
+ ****************************************************************************/
78
+
79
+
80
+ /**
81
+ * \ingroup query
82
+ * \brief nvrtcVersion sets the output parameters \p major and \p minor
83
+ * with the CUDA Runtime Compilation version number.
84
+ *
85
+ * \param [out] major CUDA Runtime Compilation major version number.
86
+ * \param [out] minor CUDA Runtime Compilation minor version number.
87
+ * \return
88
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
89
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
90
+ *
91
+ */
92
+ nvrtcResult nvrtcVersion(int *major, int *minor);
93
+
94
+
95
+ /**
96
+ * \ingroup query
97
+ * \brief nvrtcGetNumSupportedArchs sets the output parameter \p numArchs
98
+ * with the number of architectures supported by NVRTC. This can
99
+ * then be used to pass an array to ::nvrtcGetSupportedArchs to
100
+ * get the supported architectures.
101
+ *
102
+ * \param [out] numArchs number of supported architectures.
103
+ * \return
104
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
105
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
106
+ *
107
+ * see ::nvrtcGetSupportedArchs
108
+ */
109
+ nvrtcResult nvrtcGetNumSupportedArchs(int* numArchs);
110
+
111
+
112
+ /**
113
+ * \ingroup query
114
+ * \brief nvrtcGetSupportedArchs populates the array passed via the output parameter
115
+ * \p supportedArchs with the architectures supported by NVRTC. The array is
116
+ * sorted in the ascending order. The size of the array to be passed can be
117
+ * determined using ::nvrtcGetNumSupportedArchs.
118
+ *
119
+ * \param [out] supportedArchs sorted array of supported architectures.
120
+ * \return
121
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
122
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
123
+ *
124
+ * see ::nvrtcGetNumSupportedArchs
125
+ */
126
+ nvrtcResult nvrtcGetSupportedArchs(int* supportedArchs);
127
+
128
+
129
+ /*************************************************************************//**
130
+ *
131
+ * \defgroup compilation Compilation
132
+ *
133
+ * NVRTC defines the following type and functions for actual compilation.
134
+ *
135
+ ****************************************************************************/
136
+
137
+
138
+ /**
139
+ * \ingroup compilation
140
+ * \brief nvrtcProgram is the unit of compilation, and an opaque handle for
141
+ * a program.
142
+ *
143
+ * To compile a CUDA program string, an instance of nvrtcProgram must be
144
+ * created first with ::nvrtcCreateProgram, then compiled with
145
+ * ::nvrtcCompileProgram.
146
+ */
147
+ typedef struct _nvrtcProgram *nvrtcProgram;
148
+
149
+
150
+ /**
151
+ * \ingroup compilation
152
+ * \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the
153
+ * given input parameters, and sets the output parameter \p prog with
154
+ * it.
155
+ *
156
+ * \param [out] prog CUDA Runtime Compilation program.
157
+ * \param [in] src CUDA program source.
158
+ * \param [in] name CUDA program name.\n
159
+ * \p name can be \c NULL; \c "default_program" is
160
+ * used when \p name is \c NULL or "".
161
+ * \param [in] numHeaders Number of headers used.\n
162
+ * \p numHeaders must be greater than or equal to 0.
163
+ * \param [in] headers Sources of the headers.\n
164
+ * \p headers can be \c NULL when \p numHeaders is
165
+ * 0.
166
+ * \param [in] includeNames Name of each header by which they can be
167
+ * included in the CUDA program source.\n
168
+ * \p includeNames can be \c NULL when \p numHeaders
169
+ * is 0.
170
+ * \return
171
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
172
+ * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
173
+ * - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
174
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
175
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
176
+ *
177
+ * \see ::nvrtcDestroyProgram
178
+ */
179
+ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
180
+ const char *src,
181
+ const char *name,
182
+ int numHeaders,
183
+ const char * const *headers,
184
+ const char * const *includeNames);
185
+
186
+
187
+ /**
188
+ * \ingroup compilation
189
+ * \brief nvrtcDestroyProgram destroys the given program.
190
+ *
191
+ * \param [in] prog CUDA Runtime Compilation program.
192
+ * \return
193
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
194
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
195
+ *
196
+ * \see ::nvrtcCreateProgram
197
+ */
198
+ nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
199
+
200
+
201
+ /**
202
+ * \ingroup compilation
203
+ * \brief nvrtcCompileProgram compiles the given program.
204
+ *
205
+ * \param [in] prog CUDA Runtime Compilation program.
206
+ * \param [in] numOptions Number of compiler options passed.
207
+ * \param [in] options Compiler options in the form of C string array.\n
208
+ * \p options can be \c NULL when \p numOptions is 0.
209
+ *
210
+ * \return
211
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
212
+ * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
213
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
214
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
215
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
216
+ * - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
217
+ * - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
218
+ *
219
+ * It supports compile options listed in \ref options.
220
+ */
221
+ nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
222
+ int numOptions, const char * const *options);
223
+
224
+
225
+ /**
226
+ * \ingroup compilation
227
+ * \brief nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX
228
+ * generated by the previous compilation of \p prog (including the
229
+ * trailing \c NULL).
230
+ *
231
+ * \param [in] prog CUDA Runtime Compilation program.
232
+ * \param [out] ptxSizeRet Size of the generated PTX (including the trailing
233
+ * \c NULL).
234
+ * \return
235
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
236
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
237
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
238
+ *
239
+ * \see ::nvrtcGetPTX
240
+ */
241
+ nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
242
+
243
+
244
+ /**
245
+ * \ingroup compilation
246
+ * \brief nvrtcGetPTX stores the PTX generated by the previous compilation
247
+ * of \p prog in the memory pointed by \p ptx.
248
+ *
249
+ * \param [in] prog CUDA Runtime Compilation program.
250
+ * \param [out] ptx Compiled result.
251
+ * \return
252
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
253
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
254
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
255
+ *
256
+ * \see ::nvrtcGetPTXSize
257
+ */
258
+ nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
259
+
260
+
261
+ /**
262
+ * \ingroup compilation
263
+ * \brief nvrtcGetCUBINSize sets \p cubinSizeRet with the size of the cubin
264
+ * generated by the previous compilation of \p prog. The value of
265
+ * cubinSizeRet is set to 0 if the value specified to \c -arch is a
266
+ * virtual architecture instead of an actual architecture.
267
+ *
268
+ * \param [in] prog CUDA Runtime Compilation program.
269
+ * \param [out] cubinSizeRet Size of the generated cubin.
270
+ * \return
271
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
272
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
273
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
274
+ *
275
+ * \see ::nvrtcGetCUBIN
276
+ */
277
+ nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet);
278
+
279
+
280
+ /**
281
+ * \ingroup compilation
282
+ * \brief nvrtcGetCUBIN stores the cubin generated by the previous compilation
283
+ * of \p prog in the memory pointed by \p cubin. No cubin is available
284
+ * if the value specified to \c -arch is a virtual architecture instead
285
+ * of an actual architecture.
286
+ *
287
+ * \param [in] prog CUDA Runtime Compilation program.
288
+ * \param [out] cubin Compiled and assembled result.
289
+ * \return
290
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
291
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
292
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
293
+ *
294
+ * \see ::nvrtcGetCUBINSize
295
+ */
296
+ nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
297
+
298
+ /**
299
+ * \ingroup compilation
300
+ * \brief nvrtcGetNVVMSize sets \p nvvmSizeRet with the size of the NVVM
301
+ * generated by the previous compilation of \p prog. The value of
302
+ * nvvmSizeRet is set to 0 if the program was not compiled with
303
+ * \c -dlto.
304
+ *
305
+ * \param [in] prog CUDA Runtime Compilation program.
306
+ * \param [out] nvvmSizeRet Size of the generated NVVM.
307
+ * \return
308
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
309
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
310
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
311
+ *
312
+ * \see ::nvrtcGetNVVM
313
+ */
314
+ nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet);
315
+
316
+
317
+ /**
318
+ * \ingroup compilation
319
+ * \brief nvrtcGetNVVM stores the NVVM generated by the previous compilation
320
+ * of \p prog in the memory pointed by \p nvvm.
321
+ * The program must have been compiled with -dlto,
322
+ * otherwise will return an error.
323
+ *
324
+ * \param [in] prog CUDA Runtime Compilation program.
325
+ * \param [out] nvvm Compiled result.
326
+ * \return
327
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
328
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
329
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
330
+ *
331
+ * \see ::nvrtcGetNVVMSize
332
+ */
333
+ nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm);
334
+
335
+ /**
336
+ * \ingroup compilation
337
+ * \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
338
+ * log generated by the previous compilation of \p prog (including the
339
+ * trailing \c NULL).
340
+ *
341
+ * Note that compilation log may be generated with warnings and informative
342
+ * messages, even when the compilation of \p prog succeeds.
343
+ *
344
+ * \param [in] prog CUDA Runtime Compilation program.
345
+ * \param [out] logSizeRet Size of the compilation log
346
+ * (including the trailing \c NULL).
347
+ * \return
348
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
349
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
350
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
351
+ *
352
+ * \see ::nvrtcGetProgramLog
353
+ */
354
+ nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
355
+
356
+
357
+ /**
358
+ * \ingroup compilation
359
+ * \brief nvrtcGetProgramLog stores the log generated by the previous
360
+ * compilation of \p prog in the memory pointed by \p log.
361
+ *
362
+ * \param [in] prog CUDA Runtime Compilation program.
363
+ * \param [out] log Compilation log.
364
+ * \return
365
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
366
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
367
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
368
+ *
369
+ * \see ::nvrtcGetProgramLogSize
370
+ */
371
+ nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
372
+
373
+
374
+ /**
375
+ * \ingroup compilation
376
+ * \brief nvrtcAddNameExpression notes the given name expression
377
+ * denoting the address of a __global__ function
378
+ * or __device__/__constant__ variable.
379
+ *
380
+ * The identical name expression string must be provided on a subsequent
381
+ * call to nvrtcGetLoweredName to extract the lowered name.
382
+ * \param [in] prog CUDA Runtime Compilation program.
383
+ * \param [in] name_expression constant expression denoting the address of
384
+ * a __global__ function or __device__/__constant__ variable.
385
+ * \return
386
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
387
+ * - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
388
+ *
389
+ * \see ::nvrtcGetLoweredName
390
+ */
391
+ nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
392
+ const char * const name_expression);
393
+
394
+ /**
395
+ * \ingroup compilation
396
+ * \brief nvrtcGetLoweredName extracts the lowered (mangled) name
397
+ * for a __global__ function or __device__/__constant__ variable,
398
+ * and updates *lowered_name to point to it. The memory containing
399
+ * the name is released when the NVRTC program is destroyed by
400
+ * nvrtcDestroyProgram.
401
+ * The identical name expression must have been previously
402
+ * provided to nvrtcAddNameExpression.
403
+ *
404
+ * \param [in] prog CUDA Runtime Compilation program.
405
+ * \param [in] name_expression constant expression denoting the address of
406
+ * a __global__ function or __device__/__constant__ variable.
407
+ * \param [out] lowered_name initialized by the function to point to a
408
+ * C string containing the lowered (mangled)
409
+ * name corresponding to the provided name expression.
410
+ * \return
411
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
412
+ * - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
413
+ * - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
414
+ *
415
+ * \see ::nvrtcAddNameExpression
416
+ */
417
+ nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
418
+ const char *const name_expression,
419
+ const char** lowered_name);
420
+
421
+
422
+ /**
423
+ * \defgroup options Supported Compile Options
424
+ *
425
+ * NVRTC supports the compile options below.
426
+ * Option names with two preceding dashs (\c --) are long option names and
427
+ * option names with one preceding dash (\c -) are short option names.
428
+ * Short option names can be used instead of long option names.
429
+ * When a compile option takes an argument, an assignment operator (\c =)
430
+ * is used to separate the compile option argument from the compile option
431
+ * name, e.g., \c "--gpu-architecture=compute_60".
432
+ * Alternatively, the compile option name and the argument can be specified in
433
+ * separate strings without an assignment operator, .e.g,
434
+ * \c "--gpu-architecture" \c "compute_60".
435
+ * Single-character short option names, such as \c -D, \c -U, and \c -I, do
436
+ * not require an assignment operator, and the compile option name and the
437
+ * argument can be present in the same string with or without spaces between
438
+ * them.
439
+ * For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
440
+ * supported.
441
+ *
442
+ * The valid compiler options are:
443
+ *
444
+ * - Compilation targets
445
+ * - \c --gpu-architecture=\<arch\> (\c -arch)\n
446
+ * Specify the name of the class of GPU architectures for which the
447
+ * input must be compiled.\n
448
+ * - Valid <c>\<arch\></c>s:
449
+ * - \c compute_35
450
+ * - \c compute_37
451
+ * - \c compute_50
452
+ * - \c compute_52
453
+ * - \c compute_53
454
+ * - \c compute_60
455
+ * - \c compute_61
456
+ * - \c compute_62
457
+ * - \c compute_70
458
+ * - \c compute_72
459
+ * - \c compute_75
460
+ * - \c compute_80
461
+ * - \c compute_87
462
+ * - \c compute_89
463
+ * - \c compute_90
464
+ * - \c sm_35
465
+ * - \c sm_37
466
+ * - \c sm_50
467
+ * - \c sm_52
468
+ * - \c sm_53
469
+ * - \c sm_60
470
+ * - \c sm_61
471
+ * - \c sm_62
472
+ * - \c sm_70
473
+ * - \c sm_72
474
+ * - \c sm_75
475
+ * - \c sm_80
476
+ * - \c sm_87
477
+ * - \c sm_89
478
+ * - \c sm_90
479
+ * - Default: \c compute_52
480
+ * - Separate compilation / whole-program compilation
481
+ * - \c --device-c (\c -dc)\n
482
+ * Generate relocatable code that can be linked with other relocatable
483
+ * device code. It is equivalent to --relocatable-device-code=true.
484
+ * - \c --device-w (\c -dw)\n
485
+ * Generate non-relocatable code. It is equivalent to
486
+ * \c --relocatable-device-code=false.
487
+ * - \c --relocatable-device-code={true|false} (\c -rdc)\n
488
+ * Enable (disable) the generation of relocatable device code.
489
+ * - Default: \c false
490
+ * - \c --extensible-whole-program (\c -ewp)\n
491
+ * Do extensible whole program compilation of device code.
492
+ * - Default: \c false
493
+ * - Debugging support
494
+ * - \c --device-debug (\c -G)\n
495
+ * Generate debug information. If --dopt is not specified,
496
+ * then turns off all optimizations.
497
+ * - \c --generate-line-info (\c -lineinfo)\n
498
+ * Generate line-number information.
499
+ * - Code generation
500
+ * - \c --dopt on (\c -dopt)\n
501
+ * - \c --dopt=on \n
502
+ * Enable device code optimization. When specified along with '-G', enables
503
+ * limited debug information generation for optimized device code (currently,
504
+ * only line number information).
505
+ * When '-G' is not specified, '-dopt=on' is implicit.
506
+ * - \c --ptxas-options \<options\> (\c -Xptxas)\n
507
+ * - \c --ptxas-options=\<options\> \n
508
+ * Specify options directly to ptxas, the PTX optimizing assembler.
509
+ * - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
510
+ * Specify the maximum amount of registers that GPU functions can use.
511
+ * Until a function-specific limit, a higher value will generally
512
+ * increase the performance of individual GPU threads that execute this
513
+ * function. However, because thread registers are allocated from a
514
+ * global register pool on each GPU, a higher value of this option will
515
+ * also reduce the maximum thread block size, thereby reducing the amount
516
+ * of thread parallelism. Hence, a good maxrregcount value is the result
517
+ * of a trade-off. If this option is not specified, then no maximum is
518
+ * assumed. Value less than the minimum registers required by ABI will
519
+ * be bumped up by the compiler to ABI minimum limit.
520
+ * - \c --ftz={true|false} (\c -ftz)\n
521
+ * When performing single-precision floating-point operations, flush
522
+ * denormal values to zero or preserve denormal values.
523
+ * \c --use_fast_math implies \c --ftz=true.
524
+ * - Default: \c false
525
+ * - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
526
+ * For single-precision floating-point square root, use IEEE
527
+ * round-to-nearest mode or use a faster approximation.
528
+ * \c --use_fast_math implies \c --prec-sqrt=false.
529
+ * - Default: \c true
530
+ * - \c --prec-div={true|false} (\c -prec-div)\n
531
+ * For single-precision floating-point division and reciprocals, use IEEE
532
+ * round-to-nearest mode or use a faster approximation.
533
+ * \c --use_fast_math implies \c --prec-div=false.
534
+ * - Default: \c true
535
+ * - \c --fmad={true|false} (\c -fmad)\n
536
+ * Enables (disables) the contraction of floating-point multiplies and
537
+ * adds/subtracts into floating-point multiply-add operations (FMAD,
538
+ * FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
539
+ * - Default: \c true
540
+ * - \c --use_fast_math (\c -use_fast_math)\n
541
+ * Make use of fast math operations.
542
+ * \c --use_fast_math implies \c --ftz=true \c --prec-div=false
543
+ * \c --prec-sqrt=false \c --fmad=true.
544
+ * - \c --extra-device-vectorization (\c -extra-device-vectorization)\n
545
+ * Enables more aggressive device code vectorization in the NVVM optimizer.
546
+ * - \c --modify-stack-limit={true|false} (\c -modify-stack-limit)\n
547
+ * On Linux, during compilation, use \c setrlimit() to increase stack size
548
+ * to maximum allowed. The limit is reset to the previous value at the
549
+ * end of compilation.
550
+ * Note: \c setrlimit() changes the value for the entire process.
551
+ * - Default: \c true
552
+ * - \c --dlink-time-opt (\c -dlto)\n
553
+ * Generate intermediate code for later link-time optimization.
554
+ * It implies \c -rdc=true.
555
+ * Note: when this is used the nvrtcGetNVVM API should be used,
556
+ * as PTX or Cubin will not be generated.
557
+ * - Preprocessing
558
+ * - \c --define-macro=\<def\> (\c -D)\n
559
+ * \c \<def\> can be either \c \<name\> or \c \<name=definitions\>.
560
+ * - \c \<name\> \n
561
+ * Predefine \c \<name\> as a macro with definition \c 1.
562
+ * - \c \<name\>=\<definition\> \n
563
+ * The contents of \c \<definition\> are tokenized and preprocessed
564
+ * as if they appeared during translation phase three in a \c \#define
565
+ * directive. In particular, the definition will be truncated by
566
+ * embedded new line characters.
567
+ * - \c --undefine-macro=\<def\> (\c -U)\n
568
+ * Cancel any previous definition of \c \<def\>.
569
+ * - \c --include-path=\<dir\> (\c -I)\n
570
+ * Add the directory \c \<dir\> to the list of directories to be
571
+ * searched for headers. These paths are searched after the list of
572
+ * headers given to ::nvrtcCreateProgram.
573
+ * - \c --pre-include=\<header\> (\c -include)\n
574
+ * Preinclude \c \<header\> during preprocessing.
575
+ * - \c --no-source-include (\c -no-source-include)
576
+ * The preprocessor by default adds the directory of each input sources
577
+ * to the include path. This option disables this feature and only
578
+ * considers the path specified explicitly.
579
+ * - Language Dialect
580
+ * - \c --std={c++03|c++11|c++14|c++17}
581
+ * (\c -std={c++11|c++14|c++17})\n
582
+ * Set language dialect to C++03, C++11, C++14 or C++17
583
+ * - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
584
+ * Provide builtin definitions of \c std::move and \c std::forward,
585
+ * when C++11 language dialect is selected.
586
+ * - Default: \c true
587
+ * - \c --builtin-initializer-list={true|false}
588
+ * (\c -builtin-initializer-list)\n
589
+ * Provide builtin definitions of \c std::initializer_list class and
590
+ * member functions when C++11 language dialect is selected.
591
+ * - Default: \c true
592
+ * - Misc.
593
+ * - \c --disable-warnings (\c -w)\n
594
+ * Inhibit all warning messages.
595
+ * - \c --restrict (\c -restrict)\n
596
+ * Programmer assertion that all kernel pointer parameters are restrict
597
+ * pointers.
598
+ * - \c --device-as-default-execution-space
599
+ * (\c -default-device)\n
600
+ * Treat entities with no execution space annotation as \c __device__
601
+ * entities.
602
+ * - \c --device-int128 (\c -device-int128)\n
603
+ * Allow the \c __int128 type in device code. Also causes the macro \c __CUDACC_RTC_INT128__
604
+ * to be defined.
605
+ * - \c --optimization-info=\<kind\> (\c -opt-info)\n
606
+ * Provide optimization reports for the specified kind of optimization.
607
+ * The following kind tags are supported:
608
+ * - \c inline : emit a remark when a function is inlined.
609
+ * - \c --version-ident={true|false} (\c -dQ)\n
610
+ * Embed used compiler's version info into generated PTX/CUBIN
611
+ * - Default: \c false
612
+ * - \c --display-error-number (\c -err-no)\n
613
+ * Display diagnostic number for warning messages. (Default)
614
+ * - \c --no-display-error-number (\c -no-err-no)\n
615
+ * Disables the display of a diagnostic number for warning messages.
616
+ * - \c --diag-error=<error-number>,... (\c -diag-error)\n
617
+ * Emit error for specified diagnostic message number(s). Message numbers can be separated by comma.
618
+ * - \c --diag-suppress=<error-number>,... (\c -diag-suppress)\n
619
+ * Suppress specified diagnostic message number(s). Message numbers can be separated by comma.
620
+ * - \c --diag-warn=<error-number>,... (\c -diag-warn)\n
621
+ * Emit warning for specified diagnostic message number(s). Message numbers can be separated by comma.
622
+ *
623
+ */
624
+
625
+
626
+ #ifdef __cplusplus
627
+ }
628
+ #endif /* __cplusplus */
629
+
630
+
631
+ /* The utility function 'nvrtcGetTypeName' is not available by default. Define
632
+ the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
633
+ */
634
+
635
+ #if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
636
+
637
+ #if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
638
+ #include <cxxabi.h>
639
+ #include <cstdlib>
640
+
641
+ #elif defined(_WIN32)
642
+ #include <Windows.h>
643
+ #include <DbgHelp.h>
644
+ #endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
645
+
646
+
647
+ #include <string>
648
+ #include <typeinfo>
649
+
650
+ template <typename T> struct __nvrtcGetTypeName_helper_t { };
651
+
652
+ /*************************************************************************//**
653
+ *
654
+ * \defgroup hosthelper Host Helper
655
+ *
656
+ * NVRTC defines the following functions for easier interaction with host code.
657
+ *
658
+ ****************************************************************************/
659
+
660
+ /**
661
+ * \ingroup hosthelper
662
+ * \brief nvrtcGetTypeName stores the source level name of a type in the given
663
+ * std::string location.
664
+ *
665
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
666
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
667
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
668
+ * respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
669
+ * otherwise *result is initialized with the extracted name.
670
+ *
671
+ * Windows-specific notes:
672
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
673
+ * which is not multi-thread safe.
674
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
675
+ *
676
+ * \param [in] tinfo: reference to object of type std::type_info for a given type.
677
+ * \param [in] result: pointer to std::string in which to store the type name.
678
+ * \return
679
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
680
+ * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
681
+ *
682
+ */
683
+ inline nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result)
684
+ {
685
+ #if USE_CXXABI || __clang__ || __GNUC__
686
+ const char *name = tinfo.name();
687
+ int status;
688
+ char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
689
+ if (status == 0) {
690
+ *result = undecorated_name;
691
+ free(undecorated_name);
692
+ return NVRTC_SUCCESS;
693
+ }
694
+ #elif defined(_WIN32)
695
+ const char *name = tinfo.raw_name();
696
+ if (!name || *name != '.') {
697
+ return NVRTC_ERROR_INTERNAL_ERROR;
698
+ }
699
+ char undecorated_name[4096];
700
+ //name+1 skips over the '.' prefix
701
+ if(UnDecorateSymbolName(name+1, undecorated_name,
702
+ sizeof(undecorated_name) / sizeof(*undecorated_name),
703
+ //note: doesn't seem to work correctly without UNDNAME_NO_ARGUMENTS.
704
+ UNDNAME_NO_ARGUMENTS | UNDNAME_NAME_ONLY ) ) {
705
+ *result = undecorated_name;
706
+ return NVRTC_SUCCESS;
707
+ }
708
+ #endif /* USE_CXXABI || __clang__ || __GNUC__ */
709
+
710
+ return NVRTC_ERROR_INTERNAL_ERROR;
711
+ }
712
+
713
+ /**
714
+ * \ingroup hosthelper
715
+ * \brief nvrtcGetTypeName stores the source level name of the template type argument
716
+ * T in the given std::string location.
717
+ *
718
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
719
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
720
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
721
+ * respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
722
+ * otherwise *result is initialized with the extracted name.
723
+ *
724
+ * Windows-specific notes:
725
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
726
+ * which is not multi-thread safe.
727
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
728
+ *
729
+ * \param [in] result: pointer to std::string in which to store the type name.
730
+ * \return
731
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
732
+ * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
733
+ *
734
+ */
735
+
736
+ template <typename T>
737
+ nvrtcResult nvrtcGetTypeName(std::string *result)
738
+ {
739
+ nvrtcResult res = nvrtcGetTypeName(typeid(__nvrtcGetTypeName_helper_t<T>),
740
+ result);
741
+ if (res != NVRTC_SUCCESS)
742
+ return res;
743
+
744
+ std::string repr = *result;
745
+ std::size_t idx = repr.find("__nvrtcGetTypeName_helper_t");
746
+ idx = (idx != std::string::npos) ? repr.find("<", idx) : idx;
747
+ std::size_t last_idx = repr.find_last_of('>');
748
+ if (idx == std::string::npos || last_idx == std::string::npos) {
749
+ return NVRTC_ERROR_INTERNAL_ERROR;
750
+ }
751
+ ++idx;
752
+ *result = repr.substr(idx, last_idx - idx);
753
+ return NVRTC_SUCCESS;
754
+ }
755
+
756
+ #endif /* NVRTC_GET_TYPE_NAME */
757
+
758
+ #endif /* __NVRTC_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (222 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(CU_COMPLEX_H_)
51
+ #define CU_COMPLEX_H_
52
+
53
+ #if !defined(__CUDACC_RTC__)
54
+ #if defined(__GNUC__)
55
+ #if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
56
+ #pragma GCC diagnostic ignored "-Wunused-function"
57
+ #endif
58
+ #endif
59
+ #endif
60
+
61
+ /* When trying to include C header file in C++ Code extern "C" is required
62
+ * But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
63
+ * extern "C" cannot be nested
64
+ * Hence keep the header out of extern "C" block
65
+ */
66
+
67
+ #if !defined(__CUDACC__)
68
+ #include <math.h> /* import fabsf, sqrt */
69
+ #endif /* !defined(__CUDACC__) */
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif /* __cplusplus */
74
+
75
+ #include "vector_types.h"
76
+
77
+ typedef float2 cuFloatComplex;
78
+
79
+ __host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
80
+ {
81
+ return x.x;
82
+ }
83
+
84
+ __host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
85
+ {
86
+ return x.y;
87
+ }
88
+
89
+ __host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
90
+ (float r, float i)
91
+ {
92
+ cuFloatComplex res;
93
+ res.x = r;
94
+ res.y = i;
95
+ return res;
96
+ }
97
+
98
+ __host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
99
+ {
100
+ return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
101
+ }
102
+ __host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
103
+ cuFloatComplex y)
104
+ {
105
+ return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
106
+ cuCimagf(x) + cuCimagf(y));
107
+ }
108
+
109
+ __host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
110
+ cuFloatComplex y)
111
+ {
112
+ return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
113
+ cuCimagf(x) - cuCimagf(y));
114
+ }
115
+
116
+ /* This implementation could suffer from intermediate overflow even though
117
+ * the final result would be in range. However, various implementations do
118
+ * not guard against this (presumably to avoid losing performance), so we
119
+ * don't do it either to stay competitive.
120
+ */
121
+ __host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
122
+ cuFloatComplex y)
123
+ {
124
+ cuFloatComplex prod;
125
+ prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
126
+ (cuCimagf(x) * cuCimagf(y)),
127
+ (cuCrealf(x) * cuCimagf(y)) +
128
+ (cuCimagf(x) * cuCrealf(y)));
129
+ return prod;
130
+ }
131
+
132
+ /* This implementation guards against intermediate underflow and overflow
133
+ * by scaling. Such guarded implementations are usually the default for
134
+ * complex library implementations, with some also offering an unguarded,
135
+ * faster version.
136
+ */
137
+ __host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
138
+ cuFloatComplex y)
139
+ {
140
+ cuFloatComplex quot;
141
+ float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
142
+ float oos = 1.0f / s;
143
+ float ars = cuCrealf(x) * oos;
144
+ float ais = cuCimagf(x) * oos;
145
+ float brs = cuCrealf(y) * oos;
146
+ float bis = cuCimagf(y) * oos;
147
+ s = (brs * brs) + (bis * bis);
148
+ oos = 1.0f / s;
149
+ quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
150
+ ((ais * brs) - (ars * bis)) * oos);
151
+ return quot;
152
+ }
153
+
154
+ /*
155
+ * We would like to call hypotf(), but it's not available on all platforms.
156
+ * This discrete implementation guards against intermediate underflow and
157
+ * overflow by scaling. Otherwise we would lose half the exponent range.
158
+ * There are various ways of doing guarded computation. For now chose the
159
+ * simplest and fastest solution, however this may suffer from inaccuracies
160
+ * if sqrt and division are not IEEE compliant.
161
+ */
162
+ __host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
163
+ {
164
+ float a = cuCrealf(x);
165
+ float b = cuCimagf(x);
166
+ float v, w, t;
167
+ a = fabsf(a);
168
+ b = fabsf(b);
169
+ if (a > b) {
170
+ v = a;
171
+ w = b;
172
+ } else {
173
+ v = b;
174
+ w = a;
175
+ }
176
+ t = w / v;
177
+ t = 1.0f + t * t;
178
+ t = v * sqrtf(t);
179
+ if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
180
+ t = v + w;
181
+ }
182
+ return t;
183
+ }
184
+
185
+ /* Double precision */
186
+ typedef double2 cuDoubleComplex;
187
+
188
+ __host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
189
+ {
190
+ return x.x;
191
+ }
192
+
193
+ __host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
194
+ {
195
+ return x.y;
196
+ }
197
+
198
+ __host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
199
+ (double r, double i)
200
+ {
201
+ cuDoubleComplex res;
202
+ res.x = r;
203
+ res.y = i;
204
+ return res;
205
+ }
206
+
207
+ __host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
208
+ {
209
+ return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
210
+ }
211
+
212
+ __host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
213
+ cuDoubleComplex y)
214
+ {
215
+ return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
216
+ cuCimag(x) + cuCimag(y));
217
+ }
218
+
219
+ __host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
220
+ cuDoubleComplex y)
221
+ {
222
+ return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
223
+ cuCimag(x) - cuCimag(y));
224
+ }
225
+
226
+ /* This implementation could suffer from intermediate overflow even though
227
+ * the final result would be in range. However, various implementations do
228
+ * not guard against this (presumably to avoid losing performance), so we
229
+ * don't do it either to stay competitive.
230
+ */
231
+ __host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
232
+ cuDoubleComplex y)
233
+ {
234
+ cuDoubleComplex prod;
235
+ prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
236
+ (cuCimag(x) * cuCimag(y)),
237
+ (cuCreal(x) * cuCimag(y)) +
238
+ (cuCimag(x) * cuCreal(y)));
239
+ return prod;
240
+ }
241
+
242
+ /* This implementation guards against intermediate underflow and overflow
243
+ * by scaling. Such guarded implementations are usually the default for
244
+ * complex library implementations, with some also offering an unguarded,
245
+ * faster version.
246
+ */
247
+ __host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
248
+ cuDoubleComplex y)
249
+ {
250
+ cuDoubleComplex quot;
251
+ double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
252
+ double oos = 1.0 / s;
253
+ double ars = cuCreal(x) * oos;
254
+ double ais = cuCimag(x) * oos;
255
+ double brs = cuCreal(y) * oos;
256
+ double bis = cuCimag(y) * oos;
257
+ s = (brs * brs) + (bis * bis);
258
+ oos = 1.0 / s;
259
+ quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
260
+ ((ais * brs) - (ars * bis)) * oos);
261
+ return quot;
262
+ }
263
+
264
+ /* This implementation guards against intermediate underflow and overflow
265
+ * by scaling. Otherwise we would lose half the exponent range. There are
266
+ * various ways of doing guarded computation. For now chose the simplest
267
+ * and fastest solution, however this may suffer from inaccuracies if sqrt
268
+ * and division are not IEEE compliant.
269
+ */
270
+ __host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
271
+ {
272
+ double a = cuCreal(x);
273
+ double b = cuCimag(x);
274
+ double v, w, t;
275
+ a = fabs(a);
276
+ b = fabs(b);
277
+ if (a > b) {
278
+ v = a;
279
+ w = b;
280
+ } else {
281
+ v = b;
282
+ w = a;
283
+ }
284
+ t = w / v;
285
+ t = 1.0 + t * t;
286
+ t = v * sqrt(t);
287
+ if ((v == 0.0) ||
288
+ (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
289
+ t = v + w;
290
+ }
291
+ return t;
292
+ }
293
+
294
+ #if defined(__cplusplus)
295
+ }
296
+ #endif /* __cplusplus */
297
+
298
+ /* aliases */
299
+ typedef cuFloatComplex cuComplex;
300
+ __host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
301
+ float y)
302
+ {
303
+ return make_cuFloatComplex (x, y);
304
+ }
305
+
306
+ /* float-to-double promotion */
307
+ __host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
308
+ (cuFloatComplex c)
309
+ {
310
+ return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
311
+ }
312
+
313
+ __host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
314
+ (cuDoubleComplex c)
315
+ {
316
+ return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
317
+ }
318
+
319
+
320
+ __host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
321
+ {
322
+ float real_res;
323
+ float imag_res;
324
+
325
+ real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
326
+ imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
327
+
328
+ real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
329
+ imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
330
+
331
+ return make_cuComplex(real_res, imag_res);
332
+ }
333
+
334
+ __host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
335
+ {
336
+ double real_res;
337
+ double imag_res;
338
+
339
+ real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
340
+ imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
341
+
342
+ real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
343
+ imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
344
+
345
+ return make_cuDoubleComplex(real_res, imag_res);
346
+ }
347
+
348
+ #endif /* !defined(CU_COMPLEX_H_) */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_AWBARRIER_H_
51
+ # define _CUDA_AWBARRIER_H_
52
+
53
+ # include "cuda_awbarrier_primitives.h"
54
+
55
+ # if !defined(_CUDA_AWBARRIER_SM_TARGET)
56
+ # error This file requires compute capability 7.0 or greater.
57
+ # endif
58
+
59
+ # if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER)
60
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
61
+ -std=c++11 compiler option.
62
+ # endif
63
+
64
+ _CUDA_AWBARRIER_BEGIN_NAMESPACE
65
+
66
+ class awbarrier {
67
+ public:
68
+ class arrival_token {
69
+ public:
70
+ arrival_token() = default;
71
+ ~arrival_token() = default;
72
+ _CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const;
73
+ private:
74
+ _CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token);
75
+ uint64_t token;
76
+ friend awbarrier;
77
+ };
78
+ awbarrier() = default;
79
+ awbarrier(const awbarrier&) = delete;
80
+ awbarrier& operator=(const awbarrier&) = delete;
81
+ ~awbarrier() = default;
82
+
83
+ _CUDA_AWBARRIER_QUALIFIER arrival_token arrive();
84
+ _CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop();
85
+ _CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles);
86
+ _CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token);
87
+ _CUDA_AWBARRIER_QUALIFIER void arrive_and_wait();
88
+ _CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max();
89
+ private:
90
+ uint64_t barrier;
91
+ friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count);
92
+ friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier);
93
+ friend class pipeline;
94
+ };
95
+
96
+ _CUDA_AWBARRIER_QUALIFIER
97
+ uint32_t awbarrier::arrival_token::pending_count() const
98
+ {
99
+ const uint32_t pending_count = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token);
100
+ #if (__CUDA_ARCH__ >= 900)
101
+ return pending_count;
102
+ #else
103
+ return (pending_count >> 15);
104
+ #endif
105
+ }
106
+
107
+ _CUDA_AWBARRIER_QUALIFIER
108
+ awbarrier::arrival_token::arrival_token(uint64_t token)
109
+ : token(token)
110
+ {
111
+ }
112
+
113
+ _CUDA_AWBARRIER_QUALIFIER
114
+ void init(awbarrier* barrier, uint32_t expected_count)
115
+ {
116
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
117
+ _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);
118
+
119
+ #if (__CUDA_ARCH__ >= 900)
120
+ const uint32_t init_count = expected_count;
121
+ #else
122
+ const uint32_t init_count = (expected_count << 15) + expected_count;
123
+ #endif
124
+
125
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, init_count);
126
+ }
127
+
128
+ _CUDA_AWBARRIER_QUALIFIER
129
+ void inval(awbarrier* barrier)
130
+ {
131
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
132
+
133
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(&barrier->barrier);
134
+ }
135
+
136
+ _CUDA_AWBARRIER_QUALIFIER
137
+ awbarrier::arrival_token awbarrier::arrive()
138
+ {
139
+ _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
140
+
141
+ #if (__CUDA_ARCH__ < 900)
142
+ const uint32_t arrive_count = 1 << 15;
143
+ const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, arrive_count);
144
+ (void)
145
+ #else
146
+ const uint64_t token =
147
+ #endif
148
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
149
+
150
+ return arrival_token(token);
151
+ }
152
+
153
+ _CUDA_AWBARRIER_QUALIFIER
154
+ awbarrier::arrival_token awbarrier::arrive_and_drop()
155
+ {
156
+ _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
157
+
158
+ #if (__CUDA_ARCH__ < 900)
159
+ const uint32_t arrive_count = 1 << 15;
160
+ const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, arrive_count);
161
+ (void)
162
+ #else
163
+ const uint64_t token =
164
+ #endif
165
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
166
+
167
+ return arrival_token(token);
168
+ }
169
+
170
+ _CUDA_AWBARRIER_QUALIFIER
171
+ bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles)
172
+ {
173
+ constexpr uint64_t max_busy_wait_cycles = 1024;
174
+ constexpr uint32_t max_sleep_ns = 1 << 20;
175
+
176
+ _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
177
+
178
+ if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
179
+ return true;
180
+ }
181
+
182
+ uint64_t start_cycles = clock64();
183
+ uint64_t elapsed_cycles = 0;
184
+ uint32_t sleep_ns = 32;
185
+ while (elapsed_cycles < hint_cycles) {
186
+ if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
187
+ return true;
188
+ }
189
+
190
+ if (elapsed_cycles > max_busy_wait_cycles) {
191
+ __nanosleep(sleep_ns);
192
+ if (sleep_ns < max_sleep_ns) {
193
+ sleep_ns *= 2;
194
+ }
195
+ }
196
+
197
+ elapsed_cycles = clock64() - start_cycles;
198
+ }
199
+
200
+ return false;
201
+ }
202
+
203
+ _CUDA_AWBARRIER_QUALIFIER
204
+ void awbarrier::wait(arrival_token token)
205
+ {
206
+ _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
207
+
208
+ while (!timed_wait(token, ~0u));
209
+ }
210
+
211
+ _CUDA_AWBARRIER_QUALIFIER
212
+ void awbarrier::arrive_and_wait()
213
+ {
214
+ _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
215
+
216
+ this->wait(this->arrive());
217
+ }
218
+
219
+ _CUDA_AWBARRIER_QUALIFIER __host__
220
+ constexpr uint32_t awbarrier::max()
221
+ {
222
+ return _CUDA_AWBARRIER_MAX_COUNT;
223
+ }
224
+
225
+ _CUDA_AWBARRIER_END_NAMESPACE
226
+
227
+ #endif /* !_CUDA_AWBARRIER_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_AWBARRIER_HELPERS_H_
51
+ #define _CUDA_AWBARRIER_HELPERS_H_
52
+
53
+ #define _CUDA_AWBARRIER_NAMESPACE nvcuda::experimental
54
+ #define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
55
+ #define _CUDA_AWBARRIER_END_NAMESPACE } }
56
+
57
+ #define _CUDA_AWBARRIER_INTERNAL_NAMESPACE _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
58
+ #define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
59
+ #define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE } _CUDA_AWBARRIER_END_NAMESPACE
60
+
61
+ # if !defined(_CUDA_AWBARRIER_QUALIFIER)
62
+ # define _CUDA_AWBARRIER_QUALIFIER inline __device__
63
+ # endif
64
+ # if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)
65
+ # define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
66
+ #endif
67
+
68
+ #if defined(__CUDA_ARCH__)
69
+ #if (__CUDA_ARCH__ >= 800)
70
+ # define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_80
71
+ #elif (__CUDA_ARCH__ >= 700)
72
+ # define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
73
+ #endif // No support < 700
74
+ #else // !defined(__CUDA_ARCH__)
75
+ # define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
76
+ #endif // defined(__CUDA_ARCH__)
77
+
78
+ #define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)
79
+
80
+ #if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
81
+ # define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
82
+ #endif
83
+
84
+ #if !defined(_CUDA_AWBARRIER_DEBUG)
85
+ # if defined(__CUDACC_DEBUG__)
86
+ # define _CUDA_AWBARRIER_DEBUG 1
87
+ # else
88
+ # define _CUDA_AWBARRIER_DEBUG 0
89
+ # endif
90
+ #endif
91
+
92
+ #if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
93
+ # if !defined(__CUDACC_RTC__)
94
+ # include <cassert>
95
+ # endif
96
+ # define _CUDA_AWBARRIER_ASSERT(x) assert((x));
97
+ # define _CUDA_AWBARRIER_ABORT() assert(0);
98
+ #else
99
+ # define _CUDA_AWBARRIER_ASSERT(x)
100
+ # define _CUDA_AWBARRIER_ABORT() __trap();
101
+ #endif
102
+
103
+ #if defined(__CUDACC_RTC__)
104
+ typedef unsigned short uint16_t;
105
+ typedef unsigned int uint32_t;
106
+ typedef unsigned long long uint64_t;
107
+ typedef uint64_t uintptr_t;
108
+ #else
109
+ # include <stdint.h>
110
+ #endif
111
+
112
+ #if defined(_CUDA_AWBARRIER_SM_TARGET)
113
+
114
+ typedef uint64_t __mbarrier_t;
115
+ typedef uint64_t __mbarrier_token_t;
116
+
117
+ _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE
118
+
119
+ extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
120
+
121
+ namespace _CUDA_AWBARRIER_SM_70 {
122
+ union AWBarrier {
123
+ struct {
124
+ uint32_t expected;
125
+ uint32_t pending;
126
+ } split;
127
+ uint64_t raw;
128
+ };
129
+
130
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
131
+ void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
132
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
133
+ _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
134
+
135
+ AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
136
+
137
+ awbarrier->split.expected = 0x40000000 - expected_count;
138
+ awbarrier->split.pending = 0x80000000 - expected_count;
139
+ }
140
+
141
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
142
+ void __awbarrier_inval(uint64_t* barrier) {
143
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
144
+ }
145
+
146
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
147
+ uint32_t __awbarrier_token_pending_count(uint64_t token) {
148
+ const uint32_t pending = token >> 32;
149
+ return 0x80000000 - (pending & 0x7fffffff);
150
+ }
151
+
152
+ template<bool _Drop>
153
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
154
+ uint64_t __awbarrier_arrive_drop(uint64_t* barrier) {
155
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
156
+
157
+ AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
158
+
159
+ while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
160
+
161
+ if (_Drop) {
162
+ (void)atomicAdd_block(&awbarrier->split.expected, 1);
163
+ }
164
+
165
+ __threadfence_block();
166
+
167
+ const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1);
168
+ const uint32_t new_pending = old_pending + 1;
169
+ const bool reset = (old_pending ^ new_pending) & 0x80000000;
170
+
171
+ if (reset) {
172
+ __threadfence_block();
173
+
174
+ uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected);
175
+ new_expected &= ~0x40000000;
176
+ if (new_expected & 0x20000000) {
177
+ new_expected |= 0x40000000;
178
+ }
179
+ atomicAdd_block(&awbarrier->split.pending, new_expected);
180
+ }
181
+
182
+ return static_cast<uint64_t>(old_pending) << 32;
183
+ }
184
+
185
+ template<bool _Drop>
186
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
187
+ uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
188
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
189
+ _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
190
+
191
+ AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
192
+
193
+ while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
194
+
195
+ if (_Drop) {
196
+ (void)atomicAdd_block(&awbarrier->split.expected, count);
197
+ }
198
+
199
+ return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32;
200
+ }
201
+
202
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
203
+ bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
204
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
205
+
206
+ volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier);
207
+
208
+ return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000;
209
+ }
210
+ }; // namespace _CUDA_AWBARRIER_SM_70
211
+
212
+ namespace _CUDA_AWBARRIER_SM_80 {
213
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
214
+ void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
215
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
216
+ _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
217
+
218
+ asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
219
+ :
220
+ : "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
221
+ : "memory");
222
+ }
223
+
224
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
225
+ void __awbarrier_inval(uint64_t* barrier) {
226
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
227
+
228
+ asm volatile ("mbarrier.inval.shared.b64 [%0];"
229
+ :
230
+ : "r"(__nvvm_get_smem_pointer(barrier))
231
+ : "memory");
232
+ }
233
+
234
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
235
+ uint32_t __awbarrier_token_pending_count(uint64_t token) {
236
+ uint32_t __pending_count;
237
+
238
+ asm ("mbarrier.pending_count.b64 %0, %1;"
239
+ : "=r"(__pending_count)
240
+ : "l"(token));
241
+ return __pending_count;
242
+ }
243
+
244
+ template<bool _Drop>
245
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
246
+ uint64_t __awbarrier_arrive_drop(uint64_t* barrier) {
247
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
248
+
249
+ uint64_t token;
250
+
251
+ if (_Drop) {
252
+ asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
253
+ : "=l"(token)
254
+ : "r"(__nvvm_get_smem_pointer(barrier))
255
+ : "memory");
256
+ } else {
257
+ asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
258
+ : "=l"(token)
259
+ : "r"(__nvvm_get_smem_pointer(barrier))
260
+ : "memory");
261
+ }
262
+
263
+ return token;
264
+ }
265
+
266
+ template<bool _Drop>
267
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
268
+ uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
269
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
270
+ _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
271
+
272
+ uint64_t token;
273
+
274
+ if (_Drop) {
275
+ asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
276
+ : "=l"(token)
277
+ : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
278
+ : "memory");
279
+ } else {
280
+ asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
281
+ : "=l"(token)
282
+ : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
283
+ : "memory");
284
+ }
285
+
286
+ return token;
287
+ }
288
+
289
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
290
+ bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
291
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
292
+
293
+ uint16_t __wait_complete;
294
+
295
+ asm volatile ("{"
296
+ " .reg .pred %%p;"
297
+ " mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
298
+ " selp.u16 %0, 1, 0, %%p;"
299
+ "}"
300
+ : "=h"(__wait_complete)
301
+ : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
302
+ : "memory");
303
+ return bool(__wait_complete);
304
+ }
305
+
306
+ }; // namespace _CUDA_AWBARRIER_SM_80
307
+
308
+ _CUDA_AWBARRIER_QUALIFIER
309
+ void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
310
+ {
311
+ _CUDA_AWBARRIER_SM_TARGET::__awbarrier_init(barrier, expected_count);
312
+ }
313
+
314
+ _CUDA_AWBARRIER_QUALIFIER
315
+ void awbarrier_inval(uint64_t* barrier)
316
+ {
317
+ _CUDA_AWBARRIER_SM_TARGET::__awbarrier_inval(barrier);
318
+ }
319
+
320
+ _CUDA_AWBARRIER_QUALIFIER
321
+ uint32_t awbarrier_token_pending_count(uint64_t token)
322
+ {
323
+ return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_token_pending_count(token);
324
+ }
325
+
326
+ template<bool _Drop>
327
+ _CUDA_AWBARRIER_QUALIFIER
328
+ uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t arrive_count)
329
+ {
330
+ return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop_no_complete<_Drop>(barrier, arrive_count);
331
+ }
332
+
333
+ template<bool _Drop>
334
+ _CUDA_AWBARRIER_QUALIFIER
335
+ uint64_t awbarrier_arrive_drop(uint64_t* barrier)
336
+ {
337
+ return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop<_Drop>(barrier);
338
+ }
339
+
340
+ _CUDA_AWBARRIER_QUALIFIER
341
+ bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
342
+ {
343
+ return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_test_wait(barrier, token);
344
+ }
345
+
346
+ _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE
347
+
348
+ #endif /* defined(_CUDA_AWBARRIER_SM_TARGET) */
349
+
350
+ #endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_AWBARRIER_PRIMITIVES_H_
51
+ #define _CUDA_AWBARRIER_PRIMITIVES_H_
52
+
53
+ #include "cuda_awbarrier_helpers.h"
54
+
55
+ #if !defined(_CUDA_AWBARRIER_SM_TARGET)
56
+ # error This file requires compute capability 7.0 or greater.
57
+ #endif
58
+
59
+ _CUDA_AWBARRIER_STATIC_QUALIFIER __host__
60
+ uint32_t __mbarrier_maximum_count() {
61
+ return _CUDA_AWBARRIER_MAX_COUNT;
62
+ }
63
+
64
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
65
+ void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count) {
66
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count);
67
+ }
68
+
69
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
70
+ void __mbarrier_inval(__mbarrier_t* barrier) {
71
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier);
72
+ }
73
+
74
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
75
+ __mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier) {
76
+ return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(barrier);
77
+ }
78
+
79
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
80
+ __mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier) {
81
+ return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(barrier);
82
+ }
83
+
84
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
85
+ bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token) {
86
+ return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token);
87
+ }
88
+
89
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
90
+ uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token) {
91
+ return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token);
92
+ }
93
+
94
+ #endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h ADDED
@@ -0,0 +1,1958 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * CUDA Occupancy Calculator
52
+ *
53
+ * NAME
54
+ *
55
+ * cudaOccMaxActiveBlocksPerMultiprocessor,
56
+ * cudaOccMaxPotentialOccupancyBlockSize,
57
+ * cudaOccMaxPotentialOccupancyBlockSizeVariableSMem
58
+ * cudaOccAvailableDynamicSMemPerBlock
59
+ *
60
+ * DESCRIPTION
61
+ *
62
+ * The CUDA occupancy calculator provides a standalone, programmatical
63
+ * interface to compute the occupancy of a function on a device. It can also
64
+ * provide occupancy-oriented launch configuration suggestions.
65
+ *
66
+ * The function and device are defined by the user through
67
+ * cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
68
+ * structures. All APIs require all 3 of them.
69
+ *
70
+ * See the structure definition for more details about the device / function
71
+ * descriptors.
72
+ *
73
+ * See each API's prototype for API usage.
74
+ *
75
+ * COMPATIBILITY
76
+ *
77
+ * The occupancy calculator will be updated on each major CUDA toolkit
78
+ * release. It does not provide forward compatibility, i.e. new hardwares
79
+ * released after this implementation's release will not be supported.
80
+ *
81
+ * NOTE
82
+ *
83
+ * If there is access to CUDA runtime, and the sole intent is to calculate
84
+ * occupancy related values on one of the accessible CUDA devices, using CUDA
85
+ * runtime's occupancy calculation APIs is recommended.
86
+ *
87
+ */
88
+
89
+ #ifndef __cuda_occupancy_h__
90
+ #define __cuda_occupancy_h__
91
+
92
+ #include <stddef.h>
93
+ #include <limits.h>
94
+ #include <string.h>
95
+
96
+
97
+ // __OCC_INLINE will be undefined at the end of this header
98
+ //
99
+ #ifdef __CUDACC__
100
+ #define __OCC_INLINE inline __host__ __device__
101
+ #elif defined _MSC_VER
102
+ #define __OCC_INLINE __inline
103
+ #else // GNUCC assumed
104
+ #define __OCC_INLINE inline
105
+ #endif
106
+
107
+ enum cudaOccError_enum {
108
+ CUDA_OCC_SUCCESS = 0, // no error encountered
109
+ CUDA_OCC_ERROR_INVALID_INPUT = 1, // input parameter is invalid
110
+ CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2, // requested device is not supported in
111
+ // current implementation or device is
112
+ // invalid
113
+ };
114
+ typedef enum cudaOccError_enum cudaOccError;
115
+
116
+ typedef struct cudaOccResult cudaOccResult;
117
+ typedef struct cudaOccDeviceProp cudaOccDeviceProp;
118
+ typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
119
+ typedef struct cudaOccDeviceState cudaOccDeviceState;
120
+
121
+ /**
122
+ * The CUDA occupancy calculator computes the occupancy of the function
123
+ * described by attributes with the given block size (blockSize), static device
124
+ * properties (properties), dynamic device states (states) and per-block dynamic
125
+ * shared memory allocation (dynamicSMemSize) in bytes, and output it through
126
+ * result along with other useful information. The occupancy is computed in
127
+ * terms of the maximum number of active blocks per multiprocessor. The user can
128
+ * then convert it to other metrics, such as number of active warps.
129
+ *
130
+ * RETURN VALUE
131
+ *
132
+ * The occupancy and related information is returned through result.
133
+ *
134
+ * If result->activeBlocksPerMultiprocessor is 0, then the given parameter
135
+ * combination cannot run on the device.
136
+ *
137
+ * ERRORS
138
+ *
139
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
140
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
141
+ * current implementation or device is invalid
142
+ */
143
+ static __OCC_INLINE
144
+ cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
145
+ cudaOccResult *result, // out
146
+ const cudaOccDeviceProp *properties, // in
147
+ const cudaOccFuncAttributes *attributes, // in
148
+ const cudaOccDeviceState *state, // in
149
+ int blockSize, // in
150
+ size_t dynamicSmemSize); // in
151
+
152
+ /**
153
+ * The CUDA launch configurator C API suggests a grid / block size pair (in
154
+ * minGridSize and blockSize) that achieves the best potential occupancy
155
+ * (i.e. maximum number of active warps with the smallest number of blocks) for
156
+ * the given function described by attributes, on a device described by
157
+ * properties with settings in state.
158
+ *
159
+ * If per-block dynamic shared memory allocation is not needed, the user should
160
+ * leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0.
161
+ *
162
+ * If per-block dynamic shared memory allocation is needed, then if the dynamic
163
+ * shared memory size is constant regardless of block size, the size should be
164
+ * passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
165
+ * NULL.
166
+ *
167
+ * Otherwise, if the per-block dynamic shared memory size varies with different
168
+ * block sizes, the user needs to provide a pointer to an unary function through
169
+ * blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
170
+ * a block of the function for any given block size. dynamicSMemSize is
171
+ * ignored. An example signature is:
172
+ *
173
+ * // Take block size, returns dynamic shared memory needed
174
+ * size_t blockToSmem(int blockSize);
175
+ *
176
+ * RETURN VALUE
177
+ *
178
+ * The suggested block size and the minimum number of blocks needed to achieve
179
+ * the maximum occupancy are returned through blockSize and minGridSize.
180
+ *
181
+ * If *blockSize is 0, then the given combination cannot run on the device.
182
+ *
183
+ * ERRORS
184
+ *
185
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
186
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
187
+ * current implementation or device is invalid
188
+ *
189
+ */
190
+ static __OCC_INLINE
191
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
192
+ int *minGridSize, // out
193
+ int *blockSize, // out
194
+ const cudaOccDeviceProp *properties, // in
195
+ const cudaOccFuncAttributes *attributes, // in
196
+ const cudaOccDeviceState *state, // in
197
+ size_t (*blockSizeToDynamicSMemSize)(int), // in
198
+ size_t dynamicSMemSize); // in
199
+
200
+ /**
201
+ * The CUDA launch configurator C++ API suggests a grid / block size pair (in
202
+ * minGridSize and blockSize) that achieves the best potential occupancy
203
+ * (i.e. the maximum number of active warps with the smallest number of blocks)
204
+ * for the given function described by attributes, on a device described by
205
+ * properties with settings in state.
206
+ *
207
+ * If per-block dynamic shared memory allocation is 0 or constant regardless of
208
+ * block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
209
+ * configure the launch. A constant dynamic shared memory allocation size in
210
+ * bytes can be passed through dynamicSMemSize.
211
+ *
212
+ * Otherwise, if the per-block dynamic shared memory size varies with different
213
+ * block sizes, the user needs to use
214
+ * cudaOccMaxPotentialOccupancyBlockSizeVariableSmem instead, and provide a
215
+ * functor / pointer to an unary function (blockSizeToDynamicSMemSize) that
216
+ * computes the dynamic shared memory needed by func for any given block
217
+ * size. An example signature is:
218
+ *
219
+ * // Take block size, returns per-block dynamic shared memory needed
220
+ * size_t blockToSmem(int blockSize);
221
+ *
222
+ * RETURN VALUE
223
+ *
224
+ * The suggested block size and the minimum number of blocks needed to achieve
225
+ * the maximum occupancy are returned through blockSize and minGridSize.
226
+ *
227
+ * If *blockSize is 0, then the given combination cannot run on the device.
228
+ *
229
+ * ERRORS
230
+ *
231
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
232
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
233
+ * current implementation or device is invalid
234
+ *
235
+ */
236
+
237
+ #if defined(__cplusplus)
238
+ namespace {
239
+
240
+ __OCC_INLINE
241
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
242
+ int *minGridSize, // out
243
+ int *blockSize, // out
244
+ const cudaOccDeviceProp *properties, // in
245
+ const cudaOccFuncAttributes *attributes, // in
246
+ const cudaOccDeviceState *state, // in
247
+ size_t dynamicSMemSize = 0); // in
248
+
249
+ template <typename UnaryFunction>
250
+ __OCC_INLINE
251
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
252
+ int *minGridSize, // out
253
+ int *blockSize, // out
254
+ const cudaOccDeviceProp *properties, // in
255
+ const cudaOccFuncAttributes *attributes, // in
256
+ const cudaOccDeviceState *state, // in
257
+ UnaryFunction blockSizeToDynamicSMemSize); // in
258
+
259
+ } // namespace anonymous
260
+ #endif // defined(__cplusplus)
261
+
262
+ /**
263
+ *
264
+ * The CUDA dynamic shared memory calculator computes the maximum size of
265
+ * per-block dynamic shared memory if we want to place numBlocks blocks
266
+ * on an SM.
267
+ *
268
+ * RETURN VALUE
269
+ *
270
+ * Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow
271
+ * numBlocks blocks per SM.
272
+ *
273
+ * ERRORS
274
+ *
275
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
276
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
277
+ * current implementation or device is invalid
278
+ *
279
+ */
280
+ static __OCC_INLINE
281
+ cudaOccError cudaOccAvailableDynamicSMemPerBlock(
282
+ size_t *dynamicSmemSize,
283
+ const cudaOccDeviceProp *properties,
284
+ const cudaOccFuncAttributes *attributes,
285
+ const cudaOccDeviceState *state,
286
+ int numBlocks,
287
+ int blockSize);
288
+
289
+ /**
290
+ * Data structures
291
+ *
292
+ * These structures are subject to change for future architecture and CUDA
293
+ * releases. C users should initialize the structure as {0}.
294
+ *
295
+ */
296
+
297
+ /**
298
+ * Device descriptor
299
+ *
300
+ * This structure describes a device.
301
+ */
302
+ struct cudaOccDeviceProp {
303
+ int computeMajor; // Compute capability major version
304
+ int computeMinor; // Compute capability minor
305
+ // version. None supported minor version
306
+ // may cause error
307
+ int maxThreadsPerBlock; // Maximum number of threads per block
308
+ int maxThreadsPerMultiprocessor; // Maximum number of threads per SM
309
+ // i.e. (Max. number of warps) x (warp
310
+ // size)
311
+ int regsPerBlock; // Maximum number of registers per block
312
+ int regsPerMultiprocessor; // Maximum number of registers per SM
313
+ int warpSize; // Warp size
314
+ size_t sharedMemPerBlock; // Maximum shared memory size per block
315
+ size_t sharedMemPerMultiprocessor; // Maximum shared memory size per SM
316
+ int numSms; // Number of SMs available
317
+ size_t sharedMemPerBlockOptin; // Maximum optin shared memory size per block
318
+ size_t reservedSharedMemPerBlock; // Shared memory per block reserved by driver
319
+
320
+ #ifdef __cplusplus
321
+ // This structure can be converted from a cudaDeviceProp structure for users
322
+ // that use this header in their CUDA applications.
323
+ //
324
+ // If the application have access to the CUDA Runtime API, the application
325
+ // can obtain the device properties of a CUDA device through
326
+ // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
327
+ // cudaDeviceProp structure.
328
+ //
329
+ // Example:
330
+ /*
331
+ {
332
+ cudaDeviceProp prop;
333
+
334
+ cudaGetDeviceProperties(&prop, ...);
335
+
336
+ cudaOccDeviceProp occProp = prop;
337
+
338
+ ...
339
+
340
+ cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
341
+ }
342
+ */
343
+ //
344
+ template<typename DeviceProp>
345
+ __OCC_INLINE
346
+ cudaOccDeviceProp(const DeviceProp &props)
347
+ : computeMajor (props.major),
348
+ computeMinor (props.minor),
349
+ maxThreadsPerBlock (props.maxThreadsPerBlock),
350
+ maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
351
+ regsPerBlock (props.regsPerBlock),
352
+ regsPerMultiprocessor (props.regsPerMultiprocessor),
353
+ warpSize (props.warpSize),
354
+ sharedMemPerBlock (props.sharedMemPerBlock),
355
+ sharedMemPerMultiprocessor (props.sharedMemPerMultiprocessor),
356
+ numSms (props.multiProcessorCount),
357
+ sharedMemPerBlockOptin (props.sharedMemPerBlockOptin),
358
+ reservedSharedMemPerBlock (props.reservedSharedMemPerBlock)
359
+ {}
360
+
361
+ __OCC_INLINE
362
+ cudaOccDeviceProp()
363
+ : computeMajor (0),
364
+ computeMinor (0),
365
+ maxThreadsPerBlock (0),
366
+ maxThreadsPerMultiprocessor (0),
367
+ regsPerBlock (0),
368
+ regsPerMultiprocessor (0),
369
+ warpSize (0),
370
+ sharedMemPerBlock (0),
371
+ sharedMemPerMultiprocessor (0),
372
+ numSms (0),
373
+ sharedMemPerBlockOptin (0),
374
+ reservedSharedMemPerBlock (0)
375
+ {}
376
+ #endif // __cplusplus
377
+ };
378
+
379
+ /**
380
+ * Partitioned global caching option
381
+ */
382
+ typedef enum cudaOccPartitionedGCConfig_enum {
383
+ PARTITIONED_GC_OFF, // Disable partitioned global caching
384
+ PARTITIONED_GC_ON, // Prefer partitioned global caching
385
+ PARTITIONED_GC_ON_STRICT // Force partitioned global caching
386
+ } cudaOccPartitionedGCConfig;
387
+
388
+ /**
389
+ * Per function opt in maximum dynamic shared memory limit
390
+ */
391
+ typedef enum cudaOccFuncShmemConfig_enum {
392
+ FUNC_SHMEM_LIMIT_DEFAULT, // Default shmem limit
393
+ FUNC_SHMEM_LIMIT_OPTIN, // Use the optin shmem limit
394
+ } cudaOccFuncShmemConfig;
395
+
396
+ /**
397
+ * Function descriptor
398
+ *
399
+ * This structure describes a CUDA function.
400
+ */
401
+ struct cudaOccFuncAttributes {
402
+ int maxThreadsPerBlock; // Maximum block size the function can work with. If
403
+ // unlimited, use INT_MAX or any value greater than
404
+ // or equal to maxThreadsPerBlock of the device
405
+ int numRegs; // Number of registers used. When the function is
406
+ // launched on device, the register count may change
407
+ // due to internal tools requirements.
408
+ size_t sharedSizeBytes; // Number of static shared memory used
409
+
410
+ cudaOccPartitionedGCConfig partitionedGCConfig;
411
+ // Partitioned global caching is required to enable
412
+ // caching on certain chips, such as sm_52
413
+ // devices. Partitioned global caching can be
414
+ // automatically disabled if the occupancy
415
+ // requirement of the launch cannot support caching.
416
+ //
417
+ // To override this behavior with caching on and
418
+ // calculate occupancy strictly according to the
419
+ // preference, set partitionedGCConfig to
420
+ // PARTITIONED_GC_ON_STRICT. This is especially
421
+ // useful for experimenting and finding launch
422
+ // configurations (MaxPotentialOccupancyBlockSize)
423
+ // that allow global caching to take effect.
424
+ //
425
+ // This flag only affects the occupancy calculation.
426
+
427
+ cudaOccFuncShmemConfig shmemLimitConfig;
428
+ // Certain chips like sm_70 allow a user to opt into
429
+ // a higher per block limit of dynamic shared memory
430
+ // This optin is performed on a per function basis
431
+ // using the cuFuncSetAttribute function
432
+
433
+ size_t maxDynamicSharedSizeBytes;
434
+ // User set limit on maximum dynamic shared memory
435
+ // usable by the kernel
436
+ // This limit is set using the cuFuncSetAttribute
437
+ // function.
438
+
439
+ int numBlockBarriers; // Number of block barriers used (default to 1)
440
+ #ifdef __cplusplus
441
+ // This structure can be converted from a cudaFuncAttributes structure for
442
+ // users that use this header in their CUDA applications.
443
+ //
444
+ // If the application have access to the CUDA Runtime API, the application
445
+ // can obtain the function attributes of a CUDA kernel function through
446
+ // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
447
+ // cudaFuncAttributes structure.
448
+ //
449
+ // Example:
450
+ /*
451
+ __global__ void foo() {...}
452
+
453
+ ...
454
+
455
+ {
456
+ cudaFuncAttributes attr;
457
+
458
+ cudaFuncGetAttributes(&attr, foo);
459
+
460
+ cudaOccFuncAttributes occAttr = attr;
461
+
462
+ ...
463
+
464
+ cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
465
+ }
466
+ */
467
+ //
468
+ template<typename FuncAttributes>
469
+ __OCC_INLINE
470
+ cudaOccFuncAttributes(const FuncAttributes &attr)
471
+ : maxThreadsPerBlock (attr.maxThreadsPerBlock),
472
+ numRegs (attr.numRegs),
473
+ sharedSizeBytes (attr.sharedSizeBytes),
474
+ partitionedGCConfig (PARTITIONED_GC_OFF),
475
+ shmemLimitConfig (FUNC_SHMEM_LIMIT_OPTIN),
476
+ maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
477
+ numBlockBarriers (1)
478
+ {}
479
+
480
+ __OCC_INLINE
481
+ cudaOccFuncAttributes()
482
+ : maxThreadsPerBlock (0),
483
+ numRegs (0),
484
+ sharedSizeBytes (0),
485
+ partitionedGCConfig (PARTITIONED_GC_OFF),
486
+ shmemLimitConfig (FUNC_SHMEM_LIMIT_DEFAULT),
487
+ maxDynamicSharedSizeBytes (0),
488
+ numBlockBarriers (0)
489
+ {}
490
+ #endif
491
+ };
492
+
493
+ typedef enum cudaOccCacheConfig_enum {
494
+ CACHE_PREFER_NONE = 0x00, // no preference for shared memory or L1 (default)
495
+ CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache
496
+ CACHE_PREFER_L1 = 0x02, // prefer larger L1 cache and smaller shared memory
497
+ CACHE_PREFER_EQUAL = 0x03 // prefer equal sized L1 cache and shared memory
498
+ } cudaOccCacheConfig;
499
+
500
+ typedef enum cudaOccCarveoutConfig_enum {
501
+ SHAREDMEM_CARVEOUT_DEFAULT = -1, // no preference for shared memory or L1 (default)
502
+ SHAREDMEM_CARVEOUT_MAX_SHARED = 100, // prefer maximum available shared memory, minimum L1 cache
503
+ SHAREDMEM_CARVEOUT_MAX_L1 = 0, // prefer maximum available L1 cache, minimum shared memory
504
+ SHAREDMEM_CARVEOUT_HALF = 50 // prefer half of maximum available shared memory, with the rest as L1 cache
505
+ } cudaOccCarveoutConfig;
506
+
507
+ /**
508
+ * Device state descriptor
509
+ *
510
+ * This structure describes device settings that affect occupancy calculation.
511
+ */
512
+ struct cudaOccDeviceState
513
+ {
514
+ // Cache / shared memory split preference. Deprecated on Volta
515
+ cudaOccCacheConfig cacheConfig;
516
+ // Shared memory / L1 split preference. Supported on only Volta
517
+ int carveoutConfig;
518
+
519
+ #ifdef __cplusplus
520
+ __OCC_INLINE
521
+ cudaOccDeviceState()
522
+ : cacheConfig (CACHE_PREFER_NONE),
523
+ carveoutConfig (SHAREDMEM_CARVEOUT_DEFAULT)
524
+ {}
525
+ #endif
526
+ };
527
+
528
+ typedef enum cudaOccLimitingFactor_enum {
529
+ // Occupancy limited due to:
530
+ OCC_LIMIT_WARPS = 0x01, // - warps available
531
+ OCC_LIMIT_REGISTERS = 0x02, // - registers available
532
+ OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available
533
+ OCC_LIMIT_BLOCKS = 0x08, // - blocks available
534
+ OCC_LIMIT_BARRIERS = 0x10 // - barrier available
535
+ } cudaOccLimitingFactor;
536
+
537
+ /**
538
+ * Occupancy output
539
+ *
540
+ * This structure contains occupancy calculator's output.
541
+ */
542
+ struct cudaOccResult {
543
+ int activeBlocksPerMultiprocessor; // Occupancy
544
+ unsigned int limitingFactors; // Factors that limited occupancy. A bit
545
+ // field that counts the limiting
546
+ // factors, see cudaOccLimitingFactor
547
+ int blockLimitRegs; // Occupancy due to register
548
+ // usage, INT_MAX if the kernel does not
549
+ // use any register.
550
+ int blockLimitSharedMem; // Occupancy due to shared memory
551
+ // usage, INT_MAX if the kernel does not
552
+ // use shared memory.
553
+ int blockLimitWarps; // Occupancy due to block size limit
554
+ int blockLimitBlocks; // Occupancy due to maximum number of blocks
555
+ // managable per SM
556
+ int blockLimitBarriers; // Occupancy due to block barrier usage
557
+ int allocatedRegistersPerBlock; // Actual number of registers allocated per
558
+ // block
559
+ size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
560
+ // per block
561
+ cudaOccPartitionedGCConfig partitionedGCConfig;
562
+ // Report if partitioned global caching
563
+ // is actually enabled.
564
+ };
565
+
566
+ /**
567
+ * Partitioned global caching support
568
+ *
569
+ * See cudaOccPartitionedGlobalCachingModeSupport
570
+ */
571
+ typedef enum cudaOccPartitionedGCSupport_enum {
572
+ PARTITIONED_GC_NOT_SUPPORTED, // Partitioned global caching is not supported
573
+ PARTITIONED_GC_SUPPORTED, // Partitioned global caching is supported
574
+ } cudaOccPartitionedGCSupport;
575
+
576
+ /**
577
+ * Implementation
578
+ */
579
+
580
+ /**
581
+ * Max compute capability supported
582
+ */
583
+ #define __CUDA_OCC_MAJOR__ 9
584
+ #define __CUDA_OCC_MINOR__ 0
585
+
586
+ //////////////////////////////////////////
587
+ // Mathematical Helper Functions //
588
+ //////////////////////////////////////////
589
+
590
+ static __OCC_INLINE int __occMin(int lhs, int rhs)
591
+ {
592
+ return rhs < lhs ? rhs : lhs;
593
+ }
594
+
595
+ static __OCC_INLINE int __occDivideRoundUp(int x, int y)
596
+ {
597
+ return (x + (y - 1)) / y;
598
+ }
599
+
600
+ static __OCC_INLINE int __occRoundUp(int x, int y)
601
+ {
602
+ return y * __occDivideRoundUp(x, y);
603
+ }
604
+
605
+ //////////////////////////////////////////
606
+ // Architectural Properties //
607
+ //////////////////////////////////////////
608
+
609
+ /**
610
+ * Granularity of shared memory allocation
611
+ */
612
+ static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
613
+ {
614
+ int value;
615
+
616
+ switch(properties->computeMajor) {
617
+ case 3:
618
+ case 5:
619
+ case 6:
620
+ case 7:
621
+ value = 256;
622
+ break;
623
+ case 8:
624
+ case 9:
625
+ value = 128;
626
+ break;
627
+ default:
628
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
629
+ }
630
+
631
+ *limit = value;
632
+
633
+ return CUDA_OCC_SUCCESS;
634
+ }
635
+
636
+ /**
637
+ * Maximum number of registers per thread
638
+ */
639
+ static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
640
+ {
641
+ int value;
642
+
643
+ switch(properties->computeMajor) {
644
+ case 3:
645
+ case 5:
646
+ case 6:
647
+ value = 255;
648
+ break;
649
+ case 7:
650
+ case 8:
651
+ case 9:
652
+ value = 256;
653
+ break;
654
+ default:
655
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
656
+ }
657
+
658
+ *limit = value;
659
+
660
+ return CUDA_OCC_SUCCESS;
661
+ }
662
+
663
+ /**
664
+ * Granularity of register allocation
665
+ */
666
+ static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
667
+ {
668
+ int value;
669
+
670
+ switch(properties->computeMajor) {
671
+ case 3:
672
+ case 5:
673
+ case 6:
674
+ case 7:
675
+ case 8:
676
+ case 9:
677
+ value = 256;
678
+ break;
679
+ default:
680
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
681
+ }
682
+
683
+ *limit = value;
684
+
685
+ return CUDA_OCC_SUCCESS;
686
+ }
687
+
688
+ /**
689
+ * Number of sub-partitions
690
+ */
691
+ static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
692
+ {
693
+ int value;
694
+
695
+ switch(properties->computeMajor) {
696
+ case 3:
697
+ case 5:
698
+ case 7:
699
+ case 8:
700
+ case 9:
701
+ value = 4;
702
+ break;
703
+ case 6:
704
+ value = properties->computeMinor ? 4 : 2;
705
+ break;
706
+ default:
707
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
708
+ }
709
+
710
+ *limit = value;
711
+
712
+ return CUDA_OCC_SUCCESS;
713
+ }
714
+
715
+
716
+ /**
717
+ * Maximum number of blocks that can run simultaneously on a multiprocessor
718
+ */
719
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
720
+ {
721
+ int value;
722
+
723
+ switch(properties->computeMajor) {
724
+ case 3:
725
+ value = 16;
726
+ break;
727
+ case 5:
728
+ case 6:
729
+ value = 32;
730
+ break;
731
+ case 7: {
732
+ int isTuring = properties->computeMinor == 5;
733
+ value = (isTuring) ? 16 : 32;
734
+ break;
735
+ }
736
+ case 8:
737
+ if (properties->computeMinor == 0) {
738
+ value = 32;
739
+ }
740
+ else if (properties->computeMinor == 9) {
741
+ value = 24;
742
+ }
743
+ else {
744
+ value = 16;
745
+ }
746
+ break;
747
+ case 9:
748
+ value = 32;
749
+ break;
750
+ default:
751
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
752
+ }
753
+
754
+ *limit = value;
755
+
756
+ return CUDA_OCC_SUCCESS;
757
+ }
758
+
759
+ /**
760
+ * Align up shared memory based on compute major configurations
761
+ */
762
+ static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
763
+ {
764
+ // Volta and Turing have shared L1 cache / shared memory, and support cache
765
+ // configuration to trade one for the other. These values are needed to
766
+ // map carveout config ratio to the next available architecture size
767
+ size_t size = *shMemSize;
768
+
769
+ switch (properties->computeMajor) {
770
+ case 7: {
771
+ // Turing supports 32KB and 64KB shared mem.
772
+ int isTuring = properties->computeMinor == 5;
773
+ if (isTuring) {
774
+ if (size <= 32 * 1024) {
775
+ *shMemSize = 32 * 1024;
776
+ }
777
+ else if (size <= 64 * 1024) {
778
+ *shMemSize = 64 * 1024;
779
+ }
780
+ else {
781
+ return CUDA_OCC_ERROR_INVALID_INPUT;
782
+ }
783
+ }
784
+ // Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem.
785
+ else {
786
+ if (size == 0) {
787
+ *shMemSize = 0;
788
+ }
789
+ else if (size <= 8 * 1024) {
790
+ *shMemSize = 8 * 1024;
791
+ }
792
+ else if (size <= 16 * 1024) {
793
+ *shMemSize = 16 * 1024;
794
+ }
795
+ else if (size <= 32 * 1024) {
796
+ *shMemSize = 32 * 1024;
797
+ }
798
+ else if (size <= 64 * 1024) {
799
+ *shMemSize = 64 * 1024;
800
+ }
801
+ else if (size <= 96 * 1024) {
802
+ *shMemSize = 96 * 1024;
803
+ }
804
+ else {
805
+ return CUDA_OCC_ERROR_INVALID_INPUT;
806
+ }
807
+ }
808
+ break;
809
+ }
810
+ case 8:
811
+ if (properties->computeMinor == 0 || properties->computeMinor == 7) {
812
+ if (size == 0) {
813
+ *shMemSize = 0;
814
+ }
815
+ else if (size <= 8 * 1024) {
816
+ *shMemSize = 8 * 1024;
817
+ }
818
+ else if (size <= 16 * 1024) {
819
+ *shMemSize = 16 * 1024;
820
+ }
821
+ else if (size <= 32 * 1024) {
822
+ *shMemSize = 32 * 1024;
823
+ }
824
+ else if (size <= 64 * 1024) {
825
+ *shMemSize = 64 * 1024;
826
+ }
827
+ else if (size <= 100 * 1024) {
828
+ *shMemSize = 100 * 1024;
829
+ }
830
+ else if (size <= 132 * 1024) {
831
+ *shMemSize = 132 * 1024;
832
+ }
833
+ else if (size <= 164 * 1024) {
834
+ *shMemSize = 164 * 1024;
835
+ }
836
+ else {
837
+ return CUDA_OCC_ERROR_INVALID_INPUT;
838
+ }
839
+ }
840
+ else {
841
+ if (size == 0) {
842
+ *shMemSize = 0;
843
+ }
844
+ else if (size <= 8 * 1024) {
845
+ *shMemSize = 8 * 1024;
846
+ }
847
+ else if (size <= 16 * 1024) {
848
+ *shMemSize = 16 * 1024;
849
+ }
850
+ else if (size <= 32 * 1024) {
851
+ *shMemSize = 32 * 1024;
852
+ }
853
+ else if (size <= 64 * 1024) {
854
+ *shMemSize = 64 * 1024;
855
+ }
856
+ else if (size <= 100 * 1024) {
857
+ *shMemSize = 100 * 1024;
858
+ }
859
+ else {
860
+ return CUDA_OCC_ERROR_INVALID_INPUT;
861
+ }
862
+ }
863
+ break;
864
+ case 9: {
865
+ if (size == 0) {
866
+ *shMemSize = 0;
867
+ }
868
+ else if (size <= 8 * 1024) {
869
+ *shMemSize = 8 * 1024;
870
+ }
871
+ else if (size <= 16 * 1024) {
872
+ *shMemSize = 16 * 1024;
873
+ }
874
+ else if (size <= 32 * 1024) {
875
+ *shMemSize = 32 * 1024;
876
+ }
877
+ else if (size <= 64 * 1024) {
878
+ *shMemSize = 64 * 1024;
879
+ }
880
+ else if (size <= 100 * 1024) {
881
+ *shMemSize = 100 * 1024;
882
+ }
883
+ else if (size <= 132 * 1024) {
884
+ *shMemSize = 132 * 1024;
885
+ }
886
+ else if (size <= 164 * 1024) {
887
+ *shMemSize = 164 * 1024;
888
+ }
889
+ else if (size <= 196 * 1024) {
890
+ *shMemSize = 196 * 1024;
891
+ }
892
+ else if (size <= 228 * 1024) {
893
+ *shMemSize = 228 * 1024;
894
+ }
895
+ else {
896
+ return CUDA_OCC_ERROR_INVALID_INPUT;
897
+ }
898
+ break;
899
+ }
900
+ default:
901
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
902
+ }
903
+
904
+ return CUDA_OCC_SUCCESS;
905
+ }
906
+
907
+ /**
908
+ * Shared memory based on the new carveoutConfig API introduced with Volta
909
+ */
910
+ static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
911
+ {
912
+ cudaOccError status = CUDA_OCC_SUCCESS;
913
+ size_t preferenceShmemSize;
914
+
915
+ // CUDA 9.0 introduces a new API to set shared memory - L1 configuration on supported
916
+ // devices. This preference will take precedence over the older cacheConfig setting.
917
+ // Map cacheConfig to its effective preference value.
918
+ int effectivePreference = state->carveoutConfig;
919
+ if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) {
920
+ return CUDA_OCC_ERROR_INVALID_INPUT;
921
+ }
922
+
923
+ if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
924
+ switch (state->cacheConfig)
925
+ {
926
+ case CACHE_PREFER_L1:
927
+ effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1;
928
+ break;
929
+ case CACHE_PREFER_SHARED:
930
+ effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED;
931
+ break;
932
+ case CACHE_PREFER_EQUAL:
933
+ effectivePreference = SHAREDMEM_CARVEOUT_HALF;
934
+ break;
935
+ default:
936
+ effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT;
937
+ break;
938
+ }
939
+ }
940
+
941
+ if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
942
+ preferenceShmemSize = properties->sharedMemPerMultiprocessor;
943
+ }
944
+ else {
945
+ preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100;
946
+ }
947
+
948
+ status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties);
949
+ *limit = preferenceShmemSize;
950
+ return status;
951
+ }
952
+
953
+ /**
954
+ * Shared memory based on the cacheConfig
955
+ */
956
+ static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
957
+ {
958
+ size_t bytes = 0;
959
+ size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor;
960
+ cudaOccCacheConfig cacheConfig = state->cacheConfig;
961
+
962
+ // Kepler has shared L1 cache / shared memory, and support cache
963
+ // configuration to trade one for the other. These values are needed to
964
+ // calculate the correct shared memory size for user requested cache
965
+ // configuration.
966
+ //
967
+ size_t minCacheSize = 16384;
968
+ size_t maxCacheSize = 49152;
969
+ size_t cacheAndSharedTotal = sharedMemPerMultiprocessorHigh + minCacheSize;
970
+ size_t sharedMemPerMultiprocessorLow = cacheAndSharedTotal - maxCacheSize;
971
+
972
+ switch (properties->computeMajor) {
973
+ case 3:
974
+ // Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest
975
+ // is shared memory.
976
+ //
977
+ switch (cacheConfig) {
978
+ default :
979
+ case CACHE_PREFER_NONE:
980
+ case CACHE_PREFER_SHARED:
981
+ bytes = sharedMemPerMultiprocessorHigh;
982
+ break;
983
+ case CACHE_PREFER_L1:
984
+ bytes = sharedMemPerMultiprocessorLow;
985
+ break;
986
+ case CACHE_PREFER_EQUAL:
987
+ // Equal is the mid-point between high and low. It should be
988
+ // equivalent to low + 16KB.
989
+ //
990
+ bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2;
991
+ break;
992
+ }
993
+ break;
994
+ case 5:
995
+ case 6:
996
+ // Maxwell and Pascal have dedicated shared memory.
997
+ //
998
+ bytes = sharedMemPerMultiprocessorHigh;
999
+ break;
1000
+ default:
1001
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
1002
+ }
1003
+
1004
+ *limit = bytes;
1005
+
1006
+ return CUDA_OCC_SUCCESS;
1007
+ }
1008
+
1009
+ /**
1010
+ * Shared memory based on config requested by User
1011
+ */
1012
+ static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
1013
+ {
1014
+ // Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference,
1015
+ // it is handled separately from the cache config preference.
1016
+ if (properties->computeMajor >= 7) {
1017
+ return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
1018
+ }
1019
+ return cudaOccSMemPreference(limit, properties, state);
1020
+ }
1021
+
1022
+ /**
1023
+ * Return the per block shared memory limit based on function config
1024
+ */
1025
+ static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
1026
+ {
1027
+ switch (properties->computeMajor) {
1028
+ case 2:
1029
+ case 3:
1030
+ case 4:
1031
+ case 5:
1032
+ case 6:
1033
+ *limit = properties->sharedMemPerBlock;
1034
+ break;
1035
+ case 7:
1036
+ case 8:
1037
+ case 9:
1038
+ switch (shmemLimitConfig) {
1039
+ default:
1040
+ case FUNC_SHMEM_LIMIT_DEFAULT:
1041
+ *limit = properties->sharedMemPerBlock;
1042
+ break;
1043
+ case FUNC_SHMEM_LIMIT_OPTIN:
1044
+ if (smemPerCta > properties->sharedMemPerBlock) {
1045
+ *limit = properties->sharedMemPerBlockOptin;
1046
+ }
1047
+ else {
1048
+ *limit = properties->sharedMemPerBlock;
1049
+ }
1050
+ break;
1051
+ }
1052
+ break;
1053
+ default:
1054
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
1055
+ }
1056
+
1057
+ // Starting Ampere, CUDA driver reserves additional shared memory per block
1058
+ if (properties->computeMajor >= 8) {
1059
+ *limit += properties->reservedSharedMemPerBlock;
1060
+ }
1061
+
1062
+ return CUDA_OCC_SUCCESS;
1063
+ }
1064
+
1065
+ /**
1066
+ * Partitioned global caching mode support
1067
+ */
1068
+ static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
1069
+ {
1070
+ *limit = PARTITIONED_GC_NOT_SUPPORTED;
1071
+
1072
+ if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) ||
1073
+ properties->computeMajor == 6) {
1074
+ *limit = PARTITIONED_GC_SUPPORTED;
1075
+ }
1076
+
1077
+ if (properties->computeMajor == 6 && properties->computeMinor == 0) {
1078
+ *limit = PARTITIONED_GC_NOT_SUPPORTED;
1079
+ }
1080
+
1081
+ return CUDA_OCC_SUCCESS;
1082
+ }
1083
+
1084
+ ///////////////////////////////////////////////
1085
+ // User Input Sanity //
1086
+ ///////////////////////////////////////////////
1087
+
1088
+ static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
1089
+ {
1090
+ // Verify device properties
1091
+ //
1092
+ // Each of these limits must be a positive number.
1093
+ //
1094
+ // Compute capacity is checked during the occupancy calculation
1095
+ //
1096
+ if (properties->maxThreadsPerBlock <= 0 ||
1097
+ properties->maxThreadsPerMultiprocessor <= 0 ||
1098
+ properties->regsPerBlock <= 0 ||
1099
+ properties->regsPerMultiprocessor <= 0 ||
1100
+ properties->warpSize <= 0 ||
1101
+ properties->sharedMemPerBlock <= 0 ||
1102
+ properties->sharedMemPerMultiprocessor <= 0 ||
1103
+ properties->numSms <= 0) {
1104
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1105
+ }
1106
+
1107
+ return CUDA_OCC_SUCCESS;
1108
+ }
1109
+
1110
+ static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
1111
+ {
1112
+ // Verify function attributes
1113
+ //
1114
+ if (attributes->maxThreadsPerBlock <= 0 ||
1115
+ attributes->numRegs < 0) { // Compiler may choose not to use
1116
+ // any register (empty kernels,
1117
+ // etc.)
1118
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1119
+ }
1120
+
1121
+ return CUDA_OCC_SUCCESS;
1122
+ }
1123
+
1124
+ static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
1125
+ {
1126
+ (void)state; // silence unused-variable warning
1127
+ // Placeholder
1128
+ //
1129
+
1130
+ return CUDA_OCC_SUCCESS;
1131
+ }
1132
+
1133
+ static __OCC_INLINE cudaOccError cudaOccInputCheck(
1134
+ const cudaOccDeviceProp *properties,
1135
+ const cudaOccFuncAttributes *attributes,
1136
+ const cudaOccDeviceState *state)
1137
+ {
1138
+ cudaOccError status = CUDA_OCC_SUCCESS;
1139
+
1140
+ status = cudaOccDevicePropCheck(properties);
1141
+ if (status != CUDA_OCC_SUCCESS) {
1142
+ return status;
1143
+ }
1144
+
1145
+ status = cudaOccFuncAttributesCheck(attributes);
1146
+ if (status != CUDA_OCC_SUCCESS) {
1147
+ return status;
1148
+ }
1149
+
1150
+ status = cudaOccDeviceStateCheck(state);
1151
+ if (status != CUDA_OCC_SUCCESS) {
1152
+ return status;
1153
+ }
1154
+
1155
+ return status;
1156
+ }
1157
+
1158
+ ///////////////////////////////////////////////
1159
+ // Occupancy calculation Functions //
1160
+ ///////////////////////////////////////////////
1161
+
1162
+ static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
1163
+ const cudaOccDeviceProp *properties,
1164
+ const cudaOccFuncAttributes *attributes)
1165
+ {
1166
+ cudaOccPartitionedGCSupport gcSupport;
1167
+ cudaOccPartitionedGCConfig gcConfig;
1168
+
1169
+ cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties);
1170
+
1171
+ gcConfig = attributes->partitionedGCConfig;
1172
+
1173
+ if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) {
1174
+ gcConfig = PARTITIONED_GC_OFF;
1175
+ }
1176
+
1177
+ return gcConfig;
1178
+ }
1179
+
1180
+ // Warp limit
1181
+ //
1182
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
1183
+ int *limit,
1184
+ cudaOccPartitionedGCConfig gcConfig,
1185
+ const cudaOccDeviceProp *properties,
1186
+ const cudaOccFuncAttributes *attributes,
1187
+ int blockSize)
1188
+ {
1189
+ cudaOccError status = CUDA_OCC_SUCCESS;
1190
+ int maxWarpsPerSm;
1191
+ int warpsAllocatedPerCTA;
1192
+ int maxBlocks;
1193
+ (void)attributes; // silence unused-variable warning
1194
+
1195
+ if (blockSize > properties->maxThreadsPerBlock) {
1196
+ maxBlocks = 0;
1197
+ }
1198
+ else {
1199
+ maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
1200
+ warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
1201
+ maxBlocks = 0;
1202
+
1203
+ if (gcConfig != PARTITIONED_GC_OFF) {
1204
+ int maxBlocksPerSmPartition;
1205
+ int maxWarpsPerSmPartition;
1206
+
1207
+ // If partitioned global caching is on, then a CTA can only use a SM
1208
+ // partition (a half SM), and thus a half of the warp slots
1209
+ // available per SM
1210
+ //
1211
+ maxWarpsPerSmPartition = maxWarpsPerSm / 2;
1212
+ maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA;
1213
+ maxBlocks = maxBlocksPerSmPartition * 2;
1214
+ }
1215
+ // On hardware that supports partitioned global caching, each half SM is
1216
+ // guaranteed to support at least 32 warps (maximum number of warps of a
1217
+ // CTA), so caching will not cause 0 occupancy due to insufficient warp
1218
+ // allocation slots.
1219
+ //
1220
+ else {
1221
+ maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA;
1222
+ }
1223
+ }
1224
+
1225
+ *limit = maxBlocks;
1226
+
1227
+ return status;
1228
+ }
1229
+
1230
+ // Shared memory limit
1231
+ //
1232
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
1233
+ int *limit,
1234
+ cudaOccResult *result,
1235
+ const cudaOccDeviceProp *properties,
1236
+ const cudaOccFuncAttributes *attributes,
1237
+ const cudaOccDeviceState *state,
1238
+ int blockSize,
1239
+ size_t dynamicSmemSize)
1240
+ {
1241
+ cudaOccError status = CUDA_OCC_SUCCESS;
1242
+ int allocationGranularity;
1243
+ size_t userSmemPreference = 0;
1244
+ size_t totalSmemUsagePerCTA;
1245
+ size_t maxSmemUsagePerCTA;
1246
+ size_t smemAllocatedPerCTA;
1247
+ size_t staticSmemSize;
1248
+ size_t sharedMemPerMultiprocessor;
1249
+ size_t smemLimitPerCTA;
1250
+ int maxBlocks;
1251
+ int dynamicSmemSizeExceeded = 0;
1252
+ int totalSmemSizeExceeded = 0;
1253
+ (void)blockSize; // silence unused-variable warning
1254
+
1255
+ status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
1256
+ if (status != CUDA_OCC_SUCCESS) {
1257
+ return status;
1258
+ }
1259
+
1260
+ // Obtain the user preferred shared memory size. This setting is ignored if
1261
+ // user requests more shared memory than preferred.
1262
+ //
1263
+ status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
1264
+ if (status != CUDA_OCC_SUCCESS) {
1265
+ return status;
1266
+ }
1267
+
1268
+ staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
1269
+ totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
1270
+ smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);
1271
+
1272
+ maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;
1273
+
1274
+ dynamicSmemSizeExceeded = 0;
1275
+ totalSmemSizeExceeded = 0;
1276
+
1277
+ // Obtain the user set maximum dynamic size if it exists
1278
+ // If so, the current launch dynamic shared memory must not
1279
+ // exceed the set limit
1280
+ if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
1281
+ dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
1282
+ dynamicSmemSizeExceeded = 1;
1283
+ }
1284
+
1285
+ status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
1286
+ if (status != CUDA_OCC_SUCCESS) {
1287
+ return status;
1288
+ }
1289
+
1290
+ if (smemAllocatedPerCTA > smemLimitPerCTA) {
1291
+ totalSmemSizeExceeded = 1;
1292
+ }
1293
+
1294
+ if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
1295
+ maxBlocks = 0;
1296
+ }
1297
+ else {
1298
+ // User requested shared memory limit is used as long as it is greater
1299
+ // than the total shared memory used per CTA, i.e. as long as at least
1300
+ // one CTA can be launched.
1301
+ if (userSmemPreference >= smemAllocatedPerCTA) {
1302
+ sharedMemPerMultiprocessor = userSmemPreference;
1303
+ }
1304
+ else {
1305
+ // On Volta+, user requested shared memory will limit occupancy
1306
+ // if it's less than shared memory per CTA. Otherwise, the
1307
+ // maximum shared memory limit is used.
1308
+ if (properties->computeMajor >= 7) {
1309
+ sharedMemPerMultiprocessor = smemAllocatedPerCTA;
1310
+ status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
1311
+ if (status != CUDA_OCC_SUCCESS) {
1312
+ return status;
1313
+ }
1314
+ }
1315
+ else {
1316
+ sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
1317
+ }
1318
+ }
1319
+
1320
+ if (smemAllocatedPerCTA > 0) {
1321
+ maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
1322
+ }
1323
+ else {
1324
+ maxBlocks = INT_MAX;
1325
+ }
1326
+ }
1327
+
1328
+ result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;
1329
+
1330
+ *limit = maxBlocks;
1331
+
1332
+ return status;
1333
+ }
1334
+
1335
+ static __OCC_INLINE
1336
+ cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
1337
+ int *limit,
1338
+ cudaOccPartitionedGCConfig *gcConfig,
1339
+ cudaOccResult *result,
1340
+ const cudaOccDeviceProp *properties,
1341
+ const cudaOccFuncAttributes *attributes,
1342
+ int blockSize)
1343
+ {
1344
+ cudaOccError status = CUDA_OCC_SUCCESS;
1345
+ int allocationGranularity;
1346
+ int warpsAllocatedPerCTA;
1347
+ int regsAllocatedPerCTA;
1348
+ int regsAssumedPerCTA;
1349
+ int regsPerWarp;
1350
+ int regsAllocatedPerWarp;
1351
+ int numSubPartitions;
1352
+ int numRegsPerSubPartition;
1353
+ int numWarpsPerSubPartition;
1354
+ int numWarpsPerSM;
1355
+ int maxBlocks;
1356
+ int maxRegsPerThread;
1357
+
1358
+ status = cudaOccRegAllocationGranularity(
1359
+ &allocationGranularity,
1360
+ properties);
1361
+ if (status != CUDA_OCC_SUCCESS) {
1362
+ return status;
1363
+ }
1364
+
1365
+ status = cudaOccRegAllocationMaxPerThread(
1366
+ &maxRegsPerThread,
1367
+ properties);
1368
+ if (status != CUDA_OCC_SUCCESS) {
1369
+ return status;
1370
+ }
1371
+
1372
+ status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
1373
+ if (status != CUDA_OCC_SUCCESS) {
1374
+ return status;
1375
+ }
1376
+
1377
+ warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
1378
+
1379
+ // GPUs of compute capability 2.x and higher allocate registers to warps
1380
+ //
1381
+ // Number of regs per warp is regs per thread x warp size, rounded up to
1382
+ // register allocation granularity
1383
+ //
1384
+ regsPerWarp = attributes->numRegs * properties->warpSize;
1385
+ regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
1386
+ regsAllocatedPerCTA = regsAllocatedPerWarp * warpsAllocatedPerCTA;
1387
+
1388
+ // Hardware verifies if a launch fits the per-CTA register limit. For
1389
+ // historical reasons, the verification logic assumes register
1390
+ // allocations are made to all partitions simultaneously. Therefore, to
1391
+ // simulate the hardware check, the warp allocation needs to be rounded
1392
+ // up to the number of partitions.
1393
+ //
1394
+ regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);
1395
+
1396
+ if (properties->regsPerBlock < regsAssumedPerCTA || // Hardware check
1397
+ properties->regsPerBlock < regsAllocatedPerCTA || // Software check
1398
+ attributes->numRegs > maxRegsPerThread) { // Per thread limit check
1399
+ maxBlocks = 0;
1400
+ }
1401
+ else {
1402
+ if (regsAllocatedPerWarp > 0) {
1403
+ // Registers are allocated in each sub-partition. The max number
1404
+ // of warps that can fit on an SM is equal to the max number of
1405
+ // warps per sub-partition x number of sub-partitions.
1406
+ //
1407
+ numRegsPerSubPartition = properties->regsPerMultiprocessor / numSubPartitions;
1408
+ numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;
1409
+
1410
+ maxBlocks = 0;
1411
+
1412
+ if (*gcConfig != PARTITIONED_GC_OFF) {
1413
+ int numSubPartitionsPerSmPartition;
1414
+ int numWarpsPerSmPartition;
1415
+ int maxBlocksPerSmPartition;
1416
+
1417
+ // If partitioned global caching is on, then a CTA can only
1418
+ // use a half SM, and thus a half of the registers available
1419
+ // per SM
1420
+ //
1421
+ numSubPartitionsPerSmPartition = numSubPartitions / 2;
1422
+ numWarpsPerSmPartition = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
1423
+ maxBlocksPerSmPartition = numWarpsPerSmPartition / warpsAllocatedPerCTA;
1424
+ maxBlocks = maxBlocksPerSmPartition * 2;
1425
+ }
1426
+
1427
+ // Try again if partitioned global caching is not enabled, or if
1428
+ // the CTA cannot fit on the SM with caching on (maxBlocks == 0). In the latter
1429
+ // case, the device will automatically turn off caching, except
1430
+ // if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
1431
+ // occupancy and launch configuration.
1432
+ //
1433
+ if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
1434
+ // In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
1435
+ // this is what it will be if we spread CTA across partitions.
1436
+ //
1437
+ *gcConfig = PARTITIONED_GC_OFF;
1438
+ numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
1439
+ maxBlocks = numWarpsPerSM / warpsAllocatedPerCTA;
1440
+ }
1441
+ }
1442
+ else {
1443
+ maxBlocks = INT_MAX;
1444
+ }
1445
+ }
1446
+
1447
+
1448
+ result->allocatedRegistersPerBlock = regsAllocatedPerCTA;
1449
+
1450
+ *limit = maxBlocks;
1451
+
1452
+ return status;
1453
+ }
1454
+
1455
+ // Barrier limit
1456
+ //
1457
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
1458
+ int *limit,
1459
+ int ctaLimitBlocks,
1460
+ const cudaOccFuncAttributes *attributes)
1461
+ {
1462
+ cudaOccError status = CUDA_OCC_SUCCESS;
1463
+ int numBarriersAvailable = ctaLimitBlocks * 2;
1464
+ int numBarriersUsed = attributes->numBlockBarriers;
1465
+ int maxBlocks = INT_MAX;
1466
+
1467
+ if (numBarriersUsed) {
1468
+ maxBlocks = numBarriersAvailable / numBarriersUsed;
1469
+ }
1470
+
1471
+ *limit = maxBlocks;
1472
+
1473
+ return status;
1474
+ }
1475
+
1476
+ ///////////////////////////////////
1477
+ // API Implementations //
1478
+ ///////////////////////////////////
1479
+
1480
+ static __OCC_INLINE
1481
+ cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
1482
+ cudaOccResult *result,
1483
+ const cudaOccDeviceProp *properties,
1484
+ const cudaOccFuncAttributes *attributes,
1485
+ const cudaOccDeviceState *state,
1486
+ int blockSize,
1487
+ size_t dynamicSmemSize)
1488
+ {
1489
+ cudaOccError status = CUDA_OCC_SUCCESS;
1490
+ int ctaLimitWarps = 0;
1491
+ int ctaLimitBlocks = 0;
1492
+ int ctaLimitSMem = 0;
1493
+ int ctaLimitRegs = 0;
1494
+ int ctaLimitBars = 0;
1495
+ int ctaLimit = 0;
1496
+ unsigned int limitingFactors = 0;
1497
+
1498
+ cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;
1499
+
1500
+ if (!result || !properties || !attributes || !state || blockSize <= 0) {
1501
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1502
+ }
1503
+
1504
+ ///////////////////////////
1505
+ // Check user input
1506
+ ///////////////////////////
1507
+
1508
+ status = cudaOccInputCheck(properties, attributes, state);
1509
+ if (status != CUDA_OCC_SUCCESS) {
1510
+ return status;
1511
+ }
1512
+
1513
+ ///////////////////////////
1514
+ // Initialization
1515
+ ///////////////////////////
1516
+
1517
+ gcConfig = cudaOccPartitionedGCExpected(properties, attributes);
1518
+
1519
+ ///////////////////////////
1520
+ // Compute occupancy
1521
+ ///////////////////////////
1522
+
1523
+ // Limits due to registers/SM
1524
+ // Also compute if partitioned global caching has to be turned off
1525
+ //
1526
+ status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
1527
+ if (status != CUDA_OCC_SUCCESS) {
1528
+ return status;
1529
+ }
1530
+
1531
+ // SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
1532
+ // As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
1533
+ // For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0),
1534
+ // we do not let it run on any Pascal processor, even though it may be able to run on GP100.
1535
+ // Therefore, we check the occupancy on GP10x when it can run on GP100
1536
+ //
1537
+ if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
1538
+ cudaOccDeviceProp propertiesGP10x;
1539
+ cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
1540
+ int ctaLimitRegsGP10x = 0;
1541
+
1542
+ // Set up properties for GP10x
1543
+ memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
1544
+ propertiesGP10x.computeMinor = 1;
1545
+
1546
+ status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
1547
+ if (status != CUDA_OCC_SUCCESS) {
1548
+ return status;
1549
+ }
1550
+
1551
+ if (ctaLimitRegsGP10x == 0) {
1552
+ ctaLimitRegs = 0;
1553
+ }
1554
+ }
1555
+
1556
+ // Limits due to warps/SM
1557
+ //
1558
+ status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
1559
+ if (status != CUDA_OCC_SUCCESS) {
1560
+ return status;
1561
+ }
1562
+
1563
+ // Limits due to blocks/SM
1564
+ //
1565
+ status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
1566
+ if (status != CUDA_OCC_SUCCESS) {
1567
+ return status;
1568
+ }
1569
+
1570
+ // Limits due to shared memory/SM
1571
+ //
1572
+ status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
1573
+ if (status != CUDA_OCC_SUCCESS) {
1574
+ return status;
1575
+ }
1576
+
1577
+ ///////////////////////////
1578
+ // Overall occupancy
1579
+ ///////////////////////////
1580
+
1581
+ // Overall limit is min() of limits due to above reasons
1582
+ //
1583
+ ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));
1584
+
1585
+ // Determine occupancy limiting factors
1586
+ //
1587
+ if (ctaLimit == ctaLimitWarps) {
1588
+ limitingFactors |= OCC_LIMIT_WARPS;
1589
+ }
1590
+ if (ctaLimit == ctaLimitRegs) {
1591
+ limitingFactors |= OCC_LIMIT_REGISTERS;
1592
+ }
1593
+ if (ctaLimit == ctaLimitSMem) {
1594
+ limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
1595
+ }
1596
+ if (ctaLimit == ctaLimitBlocks) {
1597
+ limitingFactors |= OCC_LIMIT_BLOCKS;
1598
+ }
1599
+
1600
+ // For Hopper onwards compute the limits to occupancy based on block barrier count
1601
+ //
1602
+ if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
1603
+ // Limits due to barrier/SM
1604
+ //
1605
+ status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, attributes);
1606
+ if (status != CUDA_OCC_SUCCESS) {
1607
+ return status;
1608
+ }
1609
+
1610
+ // Recompute overall limit based on barrier/SM
1611
+ //
1612
+ ctaLimit = __occMin(ctaLimitBars, ctaLimit);
1613
+
1614
+ // Determine if this is occupancy limiting factor
1615
+ //
1616
+ if (ctaLimit == ctaLimitBars) {
1617
+ limitingFactors |= OCC_LIMIT_BARRIERS;
1618
+ }
1619
+ }
1620
+ else {
1621
+ ctaLimitBars = INT_MAX;
1622
+ }
1623
+
1624
+ // Fill in the return values
1625
+ //
1626
+ result->limitingFactors = limitingFactors;
1627
+
1628
+ result->blockLimitRegs = ctaLimitRegs;
1629
+ result->blockLimitSharedMem = ctaLimitSMem;
1630
+ result->blockLimitWarps = ctaLimitWarps;
1631
+ result->blockLimitBlocks = ctaLimitBlocks;
1632
+ result->blockLimitBarriers = ctaLimitBars;
1633
+ result->partitionedGCConfig = gcConfig;
1634
+
1635
+ // Final occupancy
1636
+ result->activeBlocksPerMultiprocessor = ctaLimit;
1637
+
1638
+ return CUDA_OCC_SUCCESS;
1639
+ }
1640
+
1641
+ static __OCC_INLINE
1642
+ cudaOccError cudaOccAvailableDynamicSMemPerBlock(
1643
+ size_t *bytesAvailable,
1644
+ const cudaOccDeviceProp *properties,
1645
+ const cudaOccFuncAttributes *attributes,
1646
+ const cudaOccDeviceState *state,
1647
+ int numBlocks,
1648
+ int blockSize)
1649
+ {
1650
+ int allocationGranularity;
1651
+ size_t smemLimitPerBlock;
1652
+ size_t smemAvailableForDynamic;
1653
+ size_t userSmemPreference = 0;
1654
+ size_t sharedMemPerMultiprocessor;
1655
+ cudaOccResult result;
1656
+ cudaOccError status = CUDA_OCC_SUCCESS;
1657
+
1658
+ if (numBlocks <= 0)
1659
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1660
+
1661
+ // First compute occupancy of potential kernel launch.
1662
+ //
1663
+ status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
1664
+ if (status != CUDA_OCC_SUCCESS) {
1665
+ return status;
1666
+ }
1667
+ // Check if occupancy is achievable given user requested number of blocks.
1668
+ //
1669
+ if (result.activeBlocksPerMultiprocessor < numBlocks) {
1670
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1671
+ }
1672
+
1673
+ status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
1674
+ if (status != CUDA_OCC_SUCCESS) {
1675
+ return status;
1676
+ }
1677
+
1678
+ // Return the per block shared memory limit based on function config.
1679
+ //
1680
+ status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
1681
+ if (status != CUDA_OCC_SUCCESS) {
1682
+ return status;
1683
+ }
1684
+
1685
+ // If there is only a single block needed per SM, then the user preference can be ignored and the fully SW
1686
+ // limit is allowed to be used as shared memory otherwise if more than one block is needed, then the user
1687
+ // preference sets the total limit of available shared memory.
1688
+ //
1689
+ cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
1690
+ if (numBlocks == 1) {
1691
+ sharedMemPerMultiprocessor = smemLimitPerBlock;
1692
+ }
1693
+ else {
1694
+ if (!userSmemPreference) {
1695
+ userSmemPreference = 1 ;
1696
+ status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
1697
+ if (status != CUDA_OCC_SUCCESS) {
1698
+ return status;
1699
+ }
1700
+ }
1701
+ sharedMemPerMultiprocessor = userSmemPreference;
1702
+ }
1703
+
1704
+ // Compute total shared memory available per SM
1705
+ //
1706
+ smemAvailableForDynamic = sharedMemPerMultiprocessor / numBlocks;
1707
+ smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;
1708
+
1709
+ // Cap shared memory
1710
+ //
1711
+ if (smemAvailableForDynamic > smemLimitPerBlock) {
1712
+ smemAvailableForDynamic = smemLimitPerBlock;
1713
+ }
1714
+
1715
+ // Now compute dynamic shared memory size
1716
+ smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes;
1717
+
1718
+ // Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute()
1719
+ //
1720
+ if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
1721
+ smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;
1722
+
1723
+ *bytesAvailable = smemAvailableForDynamic;
1724
+ return CUDA_OCC_SUCCESS;
1725
+ }
1726
+
1727
+ static __OCC_INLINE
1728
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
1729
+ int *minGridSize,
1730
+ int *blockSize,
1731
+ const cudaOccDeviceProp *properties,
1732
+ const cudaOccFuncAttributes *attributes,
1733
+ const cudaOccDeviceState *state,
1734
+ size_t (*blockSizeToDynamicSMemSize)(int),
1735
+ size_t dynamicSMemSize)
1736
+ {
1737
+ cudaOccError status = CUDA_OCC_SUCCESS;
1738
+ cudaOccResult result;
1739
+
1740
+ // Limits
1741
+ int occupancyLimit;
1742
+ int granularity;
1743
+ int blockSizeLimit;
1744
+
1745
+ // Recorded maximum
1746
+ int maxBlockSize = 0;
1747
+ int numBlocks = 0;
1748
+ int maxOccupancy = 0;
1749
+
1750
+ // Temporary
1751
+ int blockSizeToTryAligned;
1752
+ int blockSizeToTry;
1753
+ int blockSizeLimitAligned;
1754
+ int occupancyInBlocks;
1755
+ int occupancyInThreads;
1756
+
1757
+ ///////////////////////////
1758
+ // Check user input
1759
+ ///////////////////////////
1760
+
1761
+ if (!minGridSize || !blockSize || !properties || !attributes || !state) {
1762
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1763
+ }
1764
+
1765
+ status = cudaOccInputCheck(properties, attributes, state);
1766
+ if (status != CUDA_OCC_SUCCESS) {
1767
+ return status;
1768
+ }
1769
+
1770
+ /////////////////////////////////////////////////////////////////////////////////
1771
+ // Try each block size, and pick the block size with maximum occupancy
1772
+ /////////////////////////////////////////////////////////////////////////////////
1773
+
1774
+ occupancyLimit = properties->maxThreadsPerMultiprocessor;
1775
+ granularity = properties->warpSize;
1776
+
1777
+ blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
1778
+ blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
1779
+
1780
+ for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
1781
+ blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
1782
+
1783
+ // Ignore dynamicSMemSize if the user provides a mapping
1784
+ //
1785
+ if (blockSizeToDynamicSMemSize) {
1786
+ dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry);
1787
+ }
1788
+
1789
+ status = cudaOccMaxActiveBlocksPerMultiprocessor(
1790
+ &result,
1791
+ properties,
1792
+ attributes,
1793
+ state,
1794
+ blockSizeToTry,
1795
+ dynamicSMemSize);
1796
+
1797
+ if (status != CUDA_OCC_SUCCESS) {
1798
+ return status;
1799
+ }
1800
+
1801
+ occupancyInBlocks = result.activeBlocksPerMultiprocessor;
1802
+ occupancyInThreads = blockSizeToTry * occupancyInBlocks;
1803
+
1804
+ if (occupancyInThreads > maxOccupancy) {
1805
+ maxBlockSize = blockSizeToTry;
1806
+ numBlocks = occupancyInBlocks;
1807
+ maxOccupancy = occupancyInThreads;
1808
+ }
1809
+
1810
+ // Early out if we have reached the maximum
1811
+ //
1812
+ if (occupancyLimit == maxOccupancy) {
1813
+ break;
1814
+ }
1815
+ }
1816
+
1817
+ ///////////////////////////
1818
+ // Return best available
1819
+ ///////////////////////////
1820
+
1821
+ // Suggested min grid size to achieve a full machine launch
1822
+ //
1823
+ *minGridSize = numBlocks * properties->numSms;
1824
+ *blockSize = maxBlockSize;
1825
+
1826
+ return status;
1827
+ }
1828
+
1829
+
1830
+ #if defined(__cplusplus)
1831
+
1832
+ namespace {
1833
+
1834
+ __OCC_INLINE
1835
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
1836
+ int *minGridSize,
1837
+ int *blockSize,
1838
+ const cudaOccDeviceProp *properties,
1839
+ const cudaOccFuncAttributes *attributes,
1840
+ const cudaOccDeviceState *state,
1841
+ size_t dynamicSMemSize)
1842
+ {
1843
+ return cudaOccMaxPotentialOccupancyBlockSize(
1844
+ minGridSize,
1845
+ blockSize,
1846
+ properties,
1847
+ attributes,
1848
+ state,
1849
+ NULL,
1850
+ dynamicSMemSize);
1851
+ }
1852
+
1853
+ template <typename UnaryFunction>
1854
+ __OCC_INLINE
1855
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
1856
+ int *minGridSize,
1857
+ int *blockSize,
1858
+ const cudaOccDeviceProp *properties,
1859
+ const cudaOccFuncAttributes *attributes,
1860
+ const cudaOccDeviceState *state,
1861
+ UnaryFunction blockSizeToDynamicSMemSize)
1862
+ {
1863
+ cudaOccError status = CUDA_OCC_SUCCESS;
1864
+ cudaOccResult result;
1865
+
1866
+ // Limits
1867
+ int occupancyLimit;
1868
+ int granularity;
1869
+ int blockSizeLimit;
1870
+
1871
+ // Recorded maximum
1872
+ int maxBlockSize = 0;
1873
+ int numBlocks = 0;
1874
+ int maxOccupancy = 0;
1875
+
1876
+ // Temporary
1877
+ int blockSizeToTryAligned;
1878
+ int blockSizeToTry;
1879
+ int blockSizeLimitAligned;
1880
+ int occupancyInBlocks;
1881
+ int occupancyInThreads;
1882
+ size_t dynamicSMemSize;
1883
+
1884
+ ///////////////////////////
1885
+ // Check user input
1886
+ ///////////////////////////
1887
+
1888
+ if (!minGridSize || !blockSize || !properties || !attributes || !state) {
1889
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1890
+ }
1891
+
1892
+ status = cudaOccInputCheck(properties, attributes, state);
1893
+ if (status != CUDA_OCC_SUCCESS) {
1894
+ return status;
1895
+ }
1896
+
1897
+ /////////////////////////////////////////////////////////////////////////////////
1898
+ // Try each block size, and pick the block size with maximum occupancy
1899
+ /////////////////////////////////////////////////////////////////////////////////
1900
+
1901
+ occupancyLimit = properties->maxThreadsPerMultiprocessor;
1902
+ granularity = properties->warpSize;
1903
+ blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
1904
+ blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
1905
+
1906
+ for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
1907
+ blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
1908
+
1909
+ dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
1910
+
1911
+ status = cudaOccMaxActiveBlocksPerMultiprocessor(
1912
+ &result,
1913
+ properties,
1914
+ attributes,
1915
+ state,
1916
+ blockSizeToTry,
1917
+ dynamicSMemSize);
1918
+
1919
+ if (status != CUDA_OCC_SUCCESS) {
1920
+ return status;
1921
+ }
1922
+
1923
+ occupancyInBlocks = result.activeBlocksPerMultiprocessor;
1924
+
1925
+ occupancyInThreads = blockSizeToTry * occupancyInBlocks;
1926
+
1927
+ if (occupancyInThreads > maxOccupancy) {
1928
+ maxBlockSize = blockSizeToTry;
1929
+ numBlocks = occupancyInBlocks;
1930
+ maxOccupancy = occupancyInThreads;
1931
+ }
1932
+
1933
+ // Early out if we have reached the maximum
1934
+ //
1935
+ if (occupancyLimit == maxOccupancy) {
1936
+ break;
1937
+ }
1938
+ }
1939
+
1940
+ ///////////////////////////
1941
+ // Return best available
1942
+ ///////////////////////////
1943
+
1944
+ // Suggested min grid size to achieve a full machine launch
1945
+ //
1946
+ *minGridSize = numBlocks * properties->numSms;
1947
+ *blockSize = maxBlockSize;
1948
+
1949
+ return status;
1950
+ }
1951
+
1952
+ } // namespace anonymous
1953
+
1954
+ #endif /*__cplusplus */
1955
+
1956
+ #undef __OCC_INLINE
1957
+
1958
+ #endif /*__cuda_occupancy_h__*/
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_PIPELINE_H_
51
+ # define _CUDA_PIPELINE_H_
52
+
53
+ # include "cuda_pipeline_primitives.h"
54
+
55
+ # if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
56
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
57
+ -std=c++11 compiler option.
58
+ # endif
59
+
60
+ # if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
61
+ # include "cuda_awbarrier.h"
62
+ # endif
63
+
64
+ // Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>.
65
+
66
+ # if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
67
+ # if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
68
+ # define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
69
+ # else
70
+ # define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
71
+ # endif
72
+
73
+ # define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
74
+ # define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
75
+ # define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
76
+
77
+ namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
78
+ struct __block_scope_barrier_base;
79
+ }}
80
+
81
+ # endif
82
+
83
+ _CUDA_PIPELINE_BEGIN_NAMESPACE
84
+
85
+ template<size_t N, typename T>
86
+ _CUDA_PIPELINE_QUALIFIER
87
+ auto segment(T* ptr) -> T(*)[N];
88
+
89
+ class pipeline {
90
+ public:
91
+ pipeline(const pipeline&) = delete;
92
+ pipeline(pipeline&&) = delete;
93
+ pipeline& operator=(const pipeline&) = delete;
94
+ pipeline& operator=(pipeline&&) = delete;
95
+
96
+ _CUDA_PIPELINE_QUALIFIER pipeline();
97
+ _CUDA_PIPELINE_QUALIFIER size_t commit();
98
+ _CUDA_PIPELINE_QUALIFIER void commit_and_wait();
99
+ _CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
100
+ template<unsigned N>
101
+ _CUDA_PIPELINE_QUALIFIER void wait_prior();
102
+
103
+ # if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
104
+ _CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
105
+ _CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
106
+ # endif
107
+
108
+ private:
109
+ size_t current_batch;
110
+ };
111
+
112
+ template<class T>
113
+ _CUDA_PIPELINE_QUALIFIER
114
+ void memcpy_async(T& dst, const T& src, pipeline& pipe);
115
+
116
+ template<class T, size_t DstN, size_t SrcN>
117
+ _CUDA_PIPELINE_QUALIFIER
118
+ void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);
119
+
120
+ template<size_t N, typename T>
121
+ _CUDA_PIPELINE_QUALIFIER
122
+ auto segment(T* ptr) -> T(*)[N]
123
+ {
124
+ return (T(*)[N])ptr;
125
+ }
126
+
127
+ _CUDA_PIPELINE_QUALIFIER
128
+ pipeline::pipeline()
129
+ : current_batch(0)
130
+ {
131
+ }
132
+
133
+ _CUDA_PIPELINE_QUALIFIER
134
+ size_t pipeline::commit()
135
+ {
136
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
137
+ return this->current_batch++;
138
+ }
139
+
140
+ _CUDA_PIPELINE_QUALIFIER
141
+ void pipeline::commit_and_wait()
142
+ {
143
+ (void)pipeline::commit();
144
+ pipeline::wait_prior<0>();
145
+ }
146
+
147
+ _CUDA_PIPELINE_QUALIFIER
148
+ void pipeline::wait(size_t batch)
149
+ {
150
+ const size_t prior = this->current_batch > batch ? this->current_batch - batch : 0;
151
+
152
+ switch (prior) {
153
+ case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break;
154
+ case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break;
155
+ case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break;
156
+ case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break;
157
+ case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break;
158
+ case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break;
159
+ case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break;
160
+ case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break;
161
+ default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break;
162
+ }
163
+ }
164
+
165
+ template<unsigned N>
166
+ _CUDA_PIPELINE_QUALIFIER
167
+ void pipeline::wait_prior()
168
+ {
169
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
170
+ }
171
+
172
+ # if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
173
+ _CUDA_PIPELINE_QUALIFIER
174
+ void pipeline::arrive_on(awbarrier& barrier)
175
+ {
176
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier);
177
+ }
178
+
179
+ _CUDA_PIPELINE_QUALIFIER
180
+ void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier)
181
+ {
182
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t *>(&barrier));
183
+ }
184
+ # endif
185
+
186
+ template<class T>
187
+ _CUDA_PIPELINE_QUALIFIER
188
+ void memcpy_async(T& dst, const T& src, pipeline& pipe)
189
+ {
190
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
191
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));
192
+
193
+ if (__is_trivially_copyable(T)) {
194
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(
195
+ reinterpret_cast<void*>(&dst), reinterpret_cast<const void*>(&src));
196
+ } else {
197
+ dst = src;
198
+ }
199
+ }
200
+
201
+ template<class T, size_t DstN, size_t SrcN>
202
+ _CUDA_PIPELINE_QUALIFIER
203
+ void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
204
+ {
205
+ constexpr size_t dst_size = sizeof(*dst);
206
+ constexpr size_t src_size = sizeof(*src);
207
+ static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
208
+ static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
209
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
210
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));
211
+
212
+ if (__is_trivially_copyable(T)) {
213
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<sizeof(*dst), sizeof(*src)>(
214
+ reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src));
215
+ } else {
216
+ for (size_t i = 0; i < DstN; ++i) {
217
+ (*dst)[i] = (i < SrcN) ? (*src)[i] : T();
218
+ }
219
+ }
220
+ }
221
+
222
+ _CUDA_PIPELINE_END_NAMESPACE
223
+
224
+ #endif /* !_CUDA_PIPELINE_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_TEXTURE_TYPES_H__)
51
+ #define __CUDA_TEXTURE_TYPES_H__
52
+
53
+ #if defined(__cplusplus) && defined(__CUDACC__)
54
+
55
+ /*******************************************************************************
56
+ * *
57
+ * *
58
+ * *
59
+ *******************************************************************************/
60
+
61
+ #if !defined(__CUDACC_RTC__)
62
+ #define EXCLUDE_FROM_RTC
63
+ #include "channel_descriptor.h"
64
+ #undef EXCLUDE_FROM_RTC
65
+ #endif /* !__CUDACC_RTC__ */
66
+ #include "cuda_runtime_api.h"
67
+
68
+ /*******************************************************************************
69
+ * *
70
+ * *
71
+ * *
72
+ *******************************************************************************/
73
+
74
+ template<class T, int texType = cudaTextureType1D, enum cudaTextureReadMode mode = cudaReadModeElementType>
75
+ struct __device_builtin_texture_type__ texture : public textureReference
76
+ {
77
+ #if !defined(__CUDACC_RTC__)
78
+ __host__ texture(int norm = 0,
79
+ enum cudaTextureFilterMode fMode = cudaFilterModePoint,
80
+ enum cudaTextureAddressMode aMode = cudaAddressModeClamp)
81
+ {
82
+ normalized = norm;
83
+ filterMode = fMode;
84
+ addressMode[0] = aMode;
85
+ addressMode[1] = aMode;
86
+ addressMode[2] = aMode;
87
+ channelDesc = cudaCreateChannelDesc<T>();
88
+ sRGB = 0;
89
+ }
90
+
91
+ __host__ texture(int norm,
92
+ enum cudaTextureFilterMode fMode,
93
+ enum cudaTextureAddressMode aMode,
94
+ struct cudaChannelFormatDesc desc)
95
+ {
96
+ normalized = norm;
97
+ filterMode = fMode;
98
+ addressMode[0] = aMode;
99
+ addressMode[1] = aMode;
100
+ addressMode[2] = aMode;
101
+ channelDesc = desc;
102
+ sRGB = 0;
103
+ }
104
+ #endif /* !__CUDACC_RTC__ */
105
+ };
106
+
107
+ #endif /* __cplusplus && __CUDACC__ */
108
+
109
+ #endif /* !__CUDA_TEXTURE_TYPES_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
51
+ #define __DEVICE_ATOMIC_FUNCTIONS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
55
+ #else /* __CUDACC_RTC__ */
56
+ #define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ /*******************************************************************************
62
+ * *
63
+ * *
64
+ * *
65
+ *******************************************************************************/
66
+
67
+ #include "cuda_runtime_api.h"
68
+
69
+ /*******************************************************************************
70
+ * *
71
+ * *
72
+ * *
73
+ *******************************************************************************/
74
+
75
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
76
+ {
77
+ return __iAtomicAdd(address, val);
78
+ }
79
+
80
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
81
+ {
82
+ return __uAtomicAdd(address, val);
83
+ }
84
+
85
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
86
+ {
87
+ return __iAtomicAdd(address, (unsigned int)-(int)val);
88
+ }
89
+
90
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
91
+ {
92
+ return __uAtomicAdd(address, (unsigned int)-(int)val);
93
+ }
94
+
95
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
96
+ {
97
+ return __iAtomicExch(address, val);
98
+ }
99
+
100
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
101
+ {
102
+ return __uAtomicExch(address, val);
103
+ }
104
+
105
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
106
+ {
107
+ return __fAtomicExch(address, val);
108
+ }
109
+
110
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
111
+ {
112
+ return __iAtomicMin(address, val);
113
+ }
114
+
115
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
116
+ {
117
+ return __uAtomicMin(address, val);
118
+ }
119
+
120
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
121
+ {
122
+ return __iAtomicMax(address, val);
123
+ }
124
+
125
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
126
+ {
127
+ return __uAtomicMax(address, val);
128
+ }
129
+
130
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
131
+ {
132
+ return __uAtomicInc(address, val);
133
+ }
134
+
135
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
136
+ {
137
+ return __uAtomicDec(address, val);
138
+ }
139
+
140
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
141
+ {
142
+ return __iAtomicAnd(address, val);
143
+ }
144
+
145
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
146
+ {
147
+ return __uAtomicAnd(address, val);
148
+ }
149
+
150
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
151
+ {
152
+ return __iAtomicOr(address, val);
153
+ }
154
+
155
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
156
+ {
157
+ return __uAtomicOr(address, val);
158
+ }
159
+
160
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
161
+ {
162
+ return __iAtomicXor(address, val);
163
+ }
164
+
165
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
166
+ {
167
+ return __uAtomicXor(address, val);
168
+ }
169
+
170
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
171
+ {
172
+ return __iAtomicCAS(address, compare, val);
173
+ }
174
+
175
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
176
+ {
177
+ return __uAtomicCAS(address, compare, val);
178
+ }
179
+
180
+ /*******************************************************************************
181
+ * *
182
+ * *
183
+ * *
184
+ *******************************************************************************/
185
+
186
+ #include "cuda_runtime_api.h"
187
+
188
+ /*******************************************************************************
189
+ * *
190
+ * *
191
+ * *
192
+ *******************************************************************************/
193
+
194
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
195
+ {
196
+ return __ullAtomicAdd(address, val);
197
+ }
198
+
199
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
200
+ {
201
+ return __ullAtomicExch(address, val);
202
+ }
203
+
204
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
205
+ {
206
+ return __ullAtomicCAS(address, compare, val);
207
+ }
208
+
209
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
210
+ {
211
+ return (bool)__any((int)cond);
212
+ }
213
+
214
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
215
+ {
216
+ return (bool)__all((int)cond);
217
+ }
218
+
219
+ #endif /* __cplusplus && __CUDACC__ */
220
+
221
+ #undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
222
+
223
+ #endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
224
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/device_double_functions.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
65
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__DEVICE_LAUNCH_PARAMETERS_H__)
51
+ #define __DEVICE_LAUNCH_PARAMETERS_H__
52
+
53
+ #include "vector_types.h"
54
+
55
+ #if !defined(__STORAGE__)
56
+
57
+ #if defined(__CUDACC_RTC__)
58
+ #define __STORAGE__ \
59
+ extern const __device__
60
+ #else /* !__CUDACC_RTC__ */
61
+ #define __STORAGE__ \
62
+ extern const
63
+ #endif /* __CUDACC_RTC__ */
64
+
65
+ #endif /* __STORAGE__ */
66
+
67
+ #if defined(__cplusplus)
68
+ extern "C" {
69
+ #endif /* __cplusplus */
70
+
71
+ uint3 __device_builtin__ __STORAGE__ threadIdx;
72
+ uint3 __device_builtin__ __STORAGE__ blockIdx;
73
+ dim3 __device_builtin__ __STORAGE__ blockDim;
74
+ dim3 __device_builtin__ __STORAGE__ gridDim;
75
+ int __device_builtin__ __STORAGE__ warpSize;
76
+
77
+ #undef __STORAGE__
78
+
79
+ #if defined(__cplusplus)
80
+ }
81
+ #endif /* __cplusplus */
82
+
83
+ #if !defined(__cudaGet_threadIdx)
84
+
85
+ #define __cudaGet_threadIdx() \
86
+ threadIdx
87
+
88
+ #endif /* __cudaGet_threadIdx */
89
+
90
+ #if !defined(__cudaGet_blockIdx)
91
+
92
+ #define __cudaGet_blockIdx() \
93
+ blockIdx
94
+
95
+ #endif /* __cudaGet_blockIdx */
96
+
97
+ #if !defined(__cudaGet_blockDim)
98
+
99
+ #define __cudaGet_blockDim() \
100
+ blockDim
101
+
102
+ #endif /* __cudaGet_blockDim */
103
+
104
+ #if !defined(__cudaGet_gridDim)
105
+
106
+ #define __cudaGet_gridDim() \
107
+ gridDim
108
+
109
+ #endif /* __cudaGet_gridDim */
110
+
111
+ #if !defined(__cudaGet_warpSize)
112
+
113
+ #define __cudaGet_warpSize() \
114
+ warpSize
115
+
116
+ #endif /* __cudaGet_warpSize */
117
+
118
+ #endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__DRIVER_FUNCTIONS_H__)
51
+ #define __DRIVER_FUNCTIONS_H__
52
+
53
+ #include "builtin_types.h"
54
+ #include "crt/host_defines.h"
55
+ #include "driver_types.h"
56
+
57
+ /**
58
+ * \addtogroup CUDART_MEMORY
59
+ *
60
+ * @{
61
+ */
62
+
63
+ /**
64
+ * \brief Returns a cudaPitchedPtr based on input parameters
65
+ *
66
+ * Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
67
+ * \p p, \p xsz, and \p ysz.
68
+ *
69
+ * \param d - Pointer to allocated memory
70
+ * \param p - Pitch of allocated memory in bytes
71
+ * \param xsz - Logical width of allocation in elements
72
+ * \param ysz - Logical height of allocation in elements
73
+ *
74
+ * \return
75
+ * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
76
+ *
77
+ * \sa make_cudaExtent, make_cudaPos
78
+ */
79
+ static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
80
+ {
81
+ struct cudaPitchedPtr s;
82
+
83
+ s.ptr = d;
84
+ s.pitch = p;
85
+ s.xsize = xsz;
86
+ s.ysize = ysz;
87
+
88
+ return s;
89
+ }
90
+
91
+ /**
92
+ * \brief Returns a cudaPos based on input parameters
93
+ *
94
+ * Returns a ::cudaPos based on the specified input parameters \p x,
95
+ * \p y, and \p z.
96
+ *
97
+ * \param x - X position
98
+ * \param y - Y position
99
+ * \param z - Z position
100
+ *
101
+ * \return
102
+ * ::cudaPos specified by \p x, \p y, and \p z
103
+ *
104
+ * \sa make_cudaExtent, make_cudaPitchedPtr
105
+ */
106
+ static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
107
+ {
108
+ struct cudaPos p;
109
+
110
+ p.x = x;
111
+ p.y = y;
112
+ p.z = z;
113
+
114
+ return p;
115
+ }
116
+
117
+ /**
118
+ * \brief Returns a cudaExtent based on input parameters
119
+ *
120
+ * Returns a ::cudaExtent based on the specified input parameters \p w,
121
+ * \p h, and \p d.
122
+ *
123
+ * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
124
+ * \param h - Height in elements
125
+ * \param d - Depth in elements
126
+ *
127
+ * \return
128
+ * ::cudaExtent specified by \p w, \p h, and \p d
129
+ *
130
+ * \sa make_cudaPitchedPtr, make_cudaPos
131
+ */
132
+ static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
133
+ {
134
+ struct cudaExtent e;
135
+
136
+ e.width = w;
137
+ e.height = h;
138
+ e.depth = d;
139
+
140
+ return e;
141
+ }
142
+
143
+ /** @} */ /* END CUDART_MEMORY */
144
+
145
+ #endif /* !__DRIVER_FUNCTIONS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/host_config.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
65
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__LIBRARY_TYPES_H__)
51
+ #define __LIBRARY_TYPES_H__
52
+
53
+
54
+
55
+ typedef enum cudaDataType_t
56
+ {
57
+ CUDA_R_16F = 2, /* real as a half */
58
+ CUDA_C_16F = 6, /* complex as a pair of half numbers */
59
+ CUDA_R_16BF = 14, /* real as a nv_bfloat16 */
60
+ CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */
61
+ CUDA_R_32F = 0, /* real as a float */
62
+ CUDA_C_32F = 4, /* complex as a pair of float numbers */
63
+ CUDA_R_64F = 1, /* real as a double */
64
+ CUDA_C_64F = 5, /* complex as a pair of double numbers */
65
+ CUDA_R_4I = 16, /* real as a signed 4-bit int */
66
+ CUDA_C_4I = 17, /* complex as a pair of signed 4-bit int numbers */
67
+ CUDA_R_4U = 18, /* real as a unsigned 4-bit int */
68
+ CUDA_C_4U = 19, /* complex as a pair of unsigned 4-bit int numbers */
69
+ CUDA_R_8I = 3, /* real as a signed 8-bit int */
70
+ CUDA_C_8I = 7, /* complex as a pair of signed 8-bit int numbers */
71
+ CUDA_R_8U = 8, /* real as a unsigned 8-bit int */
72
+ CUDA_C_8U = 9, /* complex as a pair of unsigned 8-bit int numbers */
73
+ CUDA_R_16I = 20, /* real as a signed 16-bit int */
74
+ CUDA_C_16I = 21, /* complex as a pair of signed 16-bit int numbers */
75
+ CUDA_R_16U = 22, /* real as a unsigned 16-bit int */
76
+ CUDA_C_16U = 23, /* complex as a pair of unsigned 16-bit int numbers */
77
+ CUDA_R_32I = 10, /* real as a signed 32-bit int */
78
+ CUDA_C_32I = 11, /* complex as a pair of signed 32-bit int numbers */
79
+ CUDA_R_32U = 12, /* real as a unsigned 32-bit int */
80
+ CUDA_C_32U = 13, /* complex as a pair of unsigned 32-bit int numbers */
81
+ CUDA_R_64I = 24, /* real as a signed 64-bit int */
82
+ CUDA_C_64I = 25, /* complex as a pair of signed 64-bit int numbers */
83
+ CUDA_R_64U = 26, /* real as a unsigned 64-bit int */
84
+ CUDA_C_64U = 27, /* complex as a pair of unsigned 64-bit int numbers */
85
+ CUDA_R_8F_E4M3 = 28, /* real as a nv_fp8_e4m3 */
86
+ CUDA_R_8F_E5M2 = 29, /* real as a nv_fp8_e5m2 */
87
+ } cudaDataType;
88
+
89
+
90
+ typedef enum libraryPropertyType_t
91
+ {
92
+ MAJOR_VERSION,
93
+ MINOR_VERSION,
94
+ PATCH_LEVEL
95
+ } libraryPropertyType;
96
+
97
+
98
+ #ifndef __cplusplus
99
+ typedef enum cudaDataType_t cudaDataType_t;
100
+ typedef enum libraryPropertyType_t libraryPropertyType_t;
101
+ #endif
102
+
103
+ #endif /* !__LIBRARY_TYPES_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("math_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "math_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/math_functions.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
65
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.35.235 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_32_ATOMIC_FUNCTIONS_HPP__)
51
+ #define __SM_32_ATOMIC_FUNCTIONS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__ */
56
+ #define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ /*******************************************************************************
72
+ * *
73
+ * *
74
+ * *
75
+ *******************************************************************************/
76
+
77
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val)
78
+ {
79
+ return __illAtomicMin(address, val);
80
+ }
81
+
82
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val)
83
+ {
84
+ return __illAtomicMax(address, val);
85
+ }
86
+
87
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val)
88
+ {
89
+ return __llAtomicAnd(address, val);
90
+ }
91
+
92
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val)
93
+ {
94
+ return __llAtomicOr(address, val);
95
+ }
96
+
97
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val)
98
+ {
99
+ return __llAtomicXor(address, val);
100
+ }
101
+
102
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val)
103
+ {
104
+ return __ullAtomicMin(address, val);
105
+ }
106
+
107
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val)
108
+ {
109
+ return __ullAtomicMax(address, val);
110
+ }
111
+
112
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val)
113
+ {
114
+ return __ullAtomicAnd(address, val);
115
+ }
116
+
117
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val)
118
+ {
119
+ return __ullAtomicOr(address, val);
120
+ }
121
+
122
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val)
123
+ {
124
+ return __ullAtomicXor(address, val);
125
+ }
126
+
127
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
128
+
129
+ #endif /* __cplusplus && __CUDACC__ */
130
+
131
+ #undef __SM_32_ATOMIC_FUNCTIONS_DECL__
132
+
133
+ #endif /* !__SM_32_ATOMIC_FUNCTIONS_HPP__ */
134
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp ADDED
@@ -0,0 +1,527 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_60_ATOMIC_FUNCTIONS_HPP__)
51
+ #define __SM_60_ATOMIC_FUNCTIONS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
55
+ #else /* __CUDACC_RTC__ */
56
+ #define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ /*******************************************************************************
72
+ * *
73
+ * *
74
+ * *
75
+ *******************************************************************************/
76
+
77
+ __SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val)
78
+ {
79
+ return __dAtomicAdd(address, val);
80
+ }
81
+
82
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
83
+ int atomicAdd_block(int *address, int val)
84
+ {
85
+ return __iAtomicAdd_block(address, val);
86
+ }
87
+
88
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
89
+ int atomicAdd_system(int *address, int val)
90
+ {
91
+ return __iAtomicAdd_system(address, val);
92
+ }
93
+
94
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
95
+ unsigned int atomicAdd_block(unsigned int *address, unsigned int val)
96
+ {
97
+ return __uAtomicAdd_block(address, val);
98
+ }
99
+
100
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
101
+ unsigned int atomicAdd_system(unsigned int *address, unsigned int val)
102
+ {
103
+ return __uAtomicAdd_system(address, val);
104
+ }
105
+
106
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
107
+ unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val)
108
+ {
109
+ return __ullAtomicAdd_block(address, val);
110
+ }
111
+
112
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
113
+ unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val)
114
+ {
115
+ return __ullAtomicAdd_system(address, val);
116
+ }
117
+
118
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
119
+ float atomicAdd_block(float *address, float val)
120
+ {
121
+ return __fAtomicAdd_block(address, val);
122
+ }
123
+
124
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
125
+ float atomicAdd_system(float *address, float val)
126
+ {
127
+ return __fAtomicAdd_system(address, val);
128
+ }
129
+
130
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
131
+ double atomicAdd_block(double *address, double val)
132
+ {
133
+ return __dAtomicAdd_block(address, val);
134
+ }
135
+
136
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
137
+ double atomicAdd_system(double *address, double val)
138
+ {
139
+ return __dAtomicAdd_system(address, val);
140
+ }
141
+
142
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
143
+ int atomicSub_block(int *address, int val)
144
+ {
145
+ return __iAtomicAdd_block(address, (unsigned int)-(int)val);
146
+ }
147
+
148
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
149
+ int atomicSub_system(int *address, int val)
150
+ {
151
+ return __iAtomicAdd_system(address, (unsigned int)-(int)val);
152
+ }
153
+
154
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
155
+ unsigned int atomicSub_block(unsigned int *address, unsigned int val)
156
+ {
157
+ return __uAtomicAdd_block(address, (unsigned int)-(int)val);
158
+ }
159
+
160
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
161
+ unsigned int atomicSub_system(unsigned int *address, unsigned int val)
162
+ {
163
+ return __uAtomicAdd_system(address, (unsigned int)-(int)val);
164
+ }
165
+
166
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
167
+ int atomicExch_block(int *address, int val)
168
+ {
169
+ return __iAtomicExch_block(address, val);
170
+ }
171
+
172
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
173
+ int atomicExch_system(int *address, int val)
174
+ {
175
+ return __iAtomicExch_system(address, val);
176
+ }
177
+
178
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
179
+ unsigned int atomicExch_block(unsigned int *address, unsigned int val)
180
+ {
181
+ return __uAtomicExch_block(address, val);
182
+ }
183
+
184
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
185
+ unsigned int atomicExch_system(unsigned int *address, unsigned int val)
186
+ {
187
+ return __uAtomicExch_system(address, val);
188
+ }
189
+
190
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
191
+ unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val)
192
+ {
193
+ return __ullAtomicExch_block(address, val);
194
+ }
195
+
196
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
197
+ unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val)
198
+ {
199
+ return __ullAtomicExch_system(address, val);
200
+ }
201
+
202
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
203
+ float atomicExch_block(float *address, float val)
204
+ {
205
+ return __fAtomicExch_block(address, val);
206
+ }
207
+
208
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
209
+ float atomicExch_system(float *address, float val)
210
+ {
211
+ return __fAtomicExch_system(address, val);
212
+ }
213
+
214
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
215
+ int atomicMin_block(int *address, int val)
216
+ {
217
+ return __iAtomicMin_block(address, val);
218
+ }
219
+
220
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
221
+ int atomicMin_system(int *address, int val)
222
+ {
223
+ return __iAtomicMin_system(address, val);
224
+ }
225
+
226
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
227
+ long long atomicMin_block(long long *address, long long val)
228
+ {
229
+ return __illAtomicMin_block(address, val);
230
+ }
231
+
232
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
233
+ long long atomicMin_system(long long *address, long long val)
234
+ {
235
+ return __illAtomicMin_system(address, val);
236
+ }
237
+
238
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
239
+ unsigned int atomicMin_block(unsigned int *address, unsigned int val)
240
+ {
241
+ return __uAtomicMin_block(address, val);
242
+ }
243
+
244
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
245
+ unsigned int atomicMin_system(unsigned int *address, unsigned int val)
246
+ {
247
+ return __uAtomicMin_system(address, val);
248
+ }
249
+
250
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
251
+ unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val)
252
+ {
253
+ return __ullAtomicMin_block(address, val);
254
+ }
255
+
256
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
257
+ unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val)
258
+ {
259
+ return __ullAtomicMin_system(address, val);
260
+ }
261
+
262
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
263
+ int atomicMax_block(int *address, int val)
264
+ {
265
+ return __iAtomicMax_block(address, val);
266
+ }
267
+
268
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
269
+ int atomicMax_system(int *address, int val)
270
+ {
271
+ return __iAtomicMax_system(address, val);
272
+ }
273
+
274
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
275
+ long long atomicMax_block(long long *address, long long val)
276
+ {
277
+ return __illAtomicMax_block(address, val);
278
+ }
279
+
280
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
281
+ long long atomicMax_system(long long *address, long long val)
282
+ {
283
+ return __illAtomicMax_system(address, val);
284
+ }
285
+
286
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
287
+ unsigned int atomicMax_block(unsigned int *address, unsigned int val)
288
+ {
289
+ return __uAtomicMax_block(address, val);
290
+ }
291
+
292
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
293
+ unsigned int atomicMax_system(unsigned int *address, unsigned int val)
294
+ {
295
+ return __uAtomicMax_system(address, val);
296
+ }
297
+
298
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
299
+ unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val)
300
+ {
301
+ return __ullAtomicMax_block(address, val);
302
+ }
303
+
304
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
305
+ unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val)
306
+ {
307
+ return __ullAtomicMax_system(address, val);
308
+ }
309
+
310
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
311
+ unsigned int atomicInc_block(unsigned int *address, unsigned int val)
312
+ {
313
+ return __uAtomicInc_block(address, val);
314
+ }
315
+
316
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
317
+ unsigned int atomicInc_system(unsigned int *address, unsigned int val)
318
+ {
319
+ return __uAtomicInc_system(address, val);
320
+ }
321
+
322
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
323
+ unsigned int atomicDec_block(unsigned int *address, unsigned int val)
324
+ {
325
+ return __uAtomicDec_block(address, val);
326
+ }
327
+
328
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
329
+ unsigned int atomicDec_system(unsigned int *address, unsigned int val)
330
+ {
331
+ return __uAtomicDec_system(address, val);
332
+ }
333
+
334
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
335
+ int atomicCAS_block(int *address, int compare, int val)
336
+ {
337
+ return __iAtomicCAS_block(address, compare, val);
338
+ }
339
+
340
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
341
+ int atomicCAS_system(int *address, int compare, int val)
342
+ {
343
+ return __iAtomicCAS_system(address, compare, val);
344
+ }
345
+
346
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
347
+ unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
348
+ unsigned int val)
349
+ {
350
+ return __uAtomicCAS_block(address, compare, val);
351
+ }
352
+
353
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
354
+ unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
355
+ unsigned int val)
356
+ {
357
+ return __uAtomicCAS_system(address, compare, val);
358
+ }
359
+
360
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
361
+ unsigned long long int atomicCAS_block(unsigned long long int *address,
362
+ unsigned long long int compare,
363
+ unsigned long long int val)
364
+ {
365
+ return __ullAtomicCAS_block(address, compare, val);
366
+ }
367
+
368
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
369
+ unsigned long long int atomicCAS_system(unsigned long long int *address,
370
+ unsigned long long int compare,
371
+ unsigned long long int val)
372
+ {
373
+ return __ullAtomicCAS_system(address, compare, val);
374
+ }
375
+
376
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
377
+ int atomicAnd_block(int *address, int val)
378
+ {
379
+ return __iAtomicAnd_block(address, val);
380
+ }
381
+
382
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
383
+ int atomicAnd_system(int *address, int val)
384
+ {
385
+ return __iAtomicAnd_system(address, val);
386
+ }
387
+
388
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
389
+ long long atomicAnd_block(long long *address, long long val)
390
+ {
391
+ return __llAtomicAnd_block(address, val);
392
+ }
393
+
394
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
395
+ long long atomicAnd_system(long long *address, long long val)
396
+ {
397
+ return __llAtomicAnd_system(address, val);
398
+ }
399
+
400
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
401
+ unsigned int atomicAnd_block(unsigned int *address, unsigned int val)
402
+ {
403
+ return __uAtomicAnd_block(address, val);
404
+ }
405
+
406
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
407
+ unsigned int atomicAnd_system(unsigned int *address, unsigned int val)
408
+ {
409
+ return __uAtomicAnd_system(address, val);
410
+ }
411
+
412
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
413
+ unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val)
414
+ {
415
+ return __ullAtomicAnd_block(address, val);
416
+ }
417
+
418
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
419
+ unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val)
420
+ {
421
+ return __ullAtomicAnd_system(address, val);
422
+ }
423
+
424
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
425
+ int atomicOr_block(int *address, int val)
426
+ {
427
+ return __iAtomicOr_block(address, val);
428
+ }
429
+
430
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
431
+ int atomicOr_system(int *address, int val)
432
+ {
433
+ return __iAtomicOr_system(address, val);
434
+ }
435
+
436
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
437
+ long long atomicOr_block(long long *address, long long val)
438
+ {
439
+ return __llAtomicOr_block(address, val);
440
+ }
441
+
442
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
443
+ long long atomicOr_system(long long *address, long long val)
444
+ {
445
+ return __llAtomicOr_system(address, val);
446
+ }
447
+
448
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
449
+ unsigned int atomicOr_block(unsigned int *address, unsigned int val)
450
+ {
451
+ return __uAtomicOr_block(address, val);
452
+ }
453
+
454
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
455
+ unsigned int atomicOr_system(unsigned int *address, unsigned int val)
456
+ {
457
+ return __uAtomicOr_system(address, val);
458
+ }
459
+
460
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
461
+ unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val)
462
+ {
463
+ return __ullAtomicOr_block(address, val);
464
+ }
465
+
466
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
467
+ unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val)
468
+ {
469
+ return __ullAtomicOr_system(address, val);
470
+ }
471
+
472
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
473
+ int atomicXor_block(int *address, int val)
474
+ {
475
+ return __iAtomicXor_block(address, val);
476
+ }
477
+
478
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
479
+ int atomicXor_system(int *address, int val)
480
+ {
481
+ return __iAtomicXor_system(address, val);
482
+ }
483
+
484
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
485
+ long long atomicXor_block(long long *address, long long val)
486
+ {
487
+ return __llAtomicXor_block(address, val);
488
+ }
489
+
490
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
491
+ long long atomicXor_system(long long *address, long long val)
492
+ {
493
+ return __llAtomicXor_system(address, val);
494
+ }
495
+
496
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
497
+ unsigned int atomicXor_block(unsigned int *address, unsigned int val)
498
+ {
499
+ return __uAtomicXor_block(address, val);
500
+ }
501
+
502
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
503
+ unsigned int atomicXor_system(unsigned int *address, unsigned int val)
504
+ {
505
+ return __uAtomicXor_system(address, val);
506
+ }
507
+
508
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
509
+ unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val)
510
+ {
511
+ return __ullAtomicXor_block(address, val);
512
+ }
513
+
514
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
515
+ unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val)
516
+ {
517
+ return __ullAtomicXor_system(address, val);
518
+ }
519
+
520
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
521
+
522
+ #endif /* __cplusplus && __CUDACC__ */
523
+
524
+ #undef __SM_60_ATOMIC_FUNCTIONS_DECL__
525
+
526
+ #endif /* !__SM_60_ATOMIC_FUNCTIONS_HPP__ */
527
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SURFACE_FUNCTIONS_H__)
51
+ #define __SURFACE_FUNCTIONS_H__
52
+
53
+ #if defined(__cplusplus) && defined(__CUDACC__)
54
+
55
+ /*******************************************************************************
56
+ * *
57
+ * *
58
+ * *
59
+ *******************************************************************************/
60
+
61
+ #include "cuda_runtime_api.h"
62
+ #include "cuda_surface_types.h"
63
+
64
+ #if defined(_WIN32)
65
+ # define __DEPRECATED__ __declspec(deprecated)
66
+ #else
67
+ # define __DEPRECATED__ __attribute__((deprecated))
68
+ #endif
69
+
70
+
71
+
72
+ #ifdef __CUDA_ARCH__
73
+ template <typename T> struct __nv_surf_trait { typedef void * cast_type; };
74
+
75
+ template<> struct __nv_surf_trait<char> { typedef char * cast_type; };
76
+ template<> struct __nv_surf_trait<signed char> { typedef signed char * cast_type; };
77
+ template<> struct __nv_surf_trait<unsigned char> { typedef unsigned char * cast_type; };
78
+ template<> struct __nv_surf_trait<char1> { typedef char1 * cast_type; };
79
+ template<> struct __nv_surf_trait<uchar1> { typedef uchar1 * cast_type; };
80
+ template<> struct __nv_surf_trait<char2> { typedef char2 * cast_type; };
81
+ template<> struct __nv_surf_trait<uchar2> { typedef uchar2 * cast_type; };
82
+ template<> struct __nv_surf_trait<char4> { typedef char4 * cast_type; };
83
+ template<> struct __nv_surf_trait<uchar4> { typedef uchar4 * cast_type; };
84
+ template<> struct __nv_surf_trait<short> { typedef short * cast_type; };
85
+ template<> struct __nv_surf_trait<unsigned short> { typedef unsigned short * cast_type; };
86
+ template<> struct __nv_surf_trait<short1> { typedef short1 * cast_type; };
87
+ template<> struct __nv_surf_trait<ushort1> { typedef ushort1 * cast_type; };
88
+ template<> struct __nv_surf_trait<short2> { typedef short2 * cast_type; };
89
+ template<> struct __nv_surf_trait<ushort2> { typedef ushort2 * cast_type; };
90
+ template<> struct __nv_surf_trait<short4> { typedef short4 * cast_type; };
91
+ template<> struct __nv_surf_trait<ushort4> { typedef ushort4 * cast_type; };
92
+ template<> struct __nv_surf_trait<int> { typedef int * cast_type; };
93
+ template<> struct __nv_surf_trait<unsigned int> { typedef unsigned int * cast_type; };
94
+ template<> struct __nv_surf_trait<int1> { typedef int1 * cast_type; };
95
+ template<> struct __nv_surf_trait<uint1> { typedef uint1 * cast_type; };
96
+ template<> struct __nv_surf_trait<int2> { typedef int2 * cast_type; };
97
+ template<> struct __nv_surf_trait<uint2> { typedef uint2 * cast_type; };
98
+ template<> struct __nv_surf_trait<int4> { typedef int4 * cast_type; };
99
+ template<> struct __nv_surf_trait<uint4> { typedef uint4 * cast_type; };
100
+ template<> struct __nv_surf_trait<long long> { typedef long long * cast_type; };
101
+ template<> struct __nv_surf_trait<unsigned long long> { typedef unsigned long long * cast_type; };
102
+ template<> struct __nv_surf_trait<longlong1> { typedef longlong1 * cast_type; };
103
+ template<> struct __nv_surf_trait<ulonglong1> { typedef ulonglong1 * cast_type; };
104
+ template<> struct __nv_surf_trait<longlong2> { typedef longlong2 * cast_type; };
105
+ template<> struct __nv_surf_trait<ulonglong2> { typedef ulonglong2 * cast_type; };
106
+ #if !defined(__LP64__)
107
+ template<> struct __nv_surf_trait<long> { typedef int * cast_type; };
108
+ template<> struct __nv_surf_trait<unsigned long> { typedef unsigned int * cast_type; };
109
+ template<> struct __nv_surf_trait<long1> { typedef int1 * cast_type; };
110
+ template<> struct __nv_surf_trait<ulong1> { typedef uint1 * cast_type; };
111
+ template<> struct __nv_surf_trait<long2> { typedef int2 * cast_type; };
112
+ template<> struct __nv_surf_trait<ulong2> { typedef uint2 * cast_type; };
113
+ template<> struct __nv_surf_trait<long4> { typedef uint4 * cast_type; };
114
+ template<> struct __nv_surf_trait<ulong4> { typedef int4 * cast_type; };
115
+ #endif
116
+ template<> struct __nv_surf_trait<float> { typedef float * cast_type; };
117
+ template<> struct __nv_surf_trait<float1> { typedef float1 * cast_type; };
118
+ template<> struct __nv_surf_trait<float2> { typedef float2 * cast_type; };
119
+ template<> struct __nv_surf_trait<float4> { typedef float4 * cast_type; };
120
+ #endif /* defined(__CUDA_ARCH__) */
121
+
122
+ template <typename T>
123
+ static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
124
+ {
125
+ #ifdef __CUDA_ARCH__
126
+ __nv_tex_surf_handler("__surf1Dread_v2", (void *)res, s, surf, x, mode);
127
+ #endif
128
+ }
129
+
130
+ template<class T>
131
+ static __DEPRECATED__ __device__ __forceinline__ T surf1Dread(surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
132
+ {
133
+ #ifdef __CUDA_ARCH__
134
+ T temp;
135
+ __nv_tex_surf_handler("__surf1Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, mode);
136
+ return temp;
137
+ #endif
138
+ }
139
+
140
+ template<class T>
141
+ static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
142
+ {
143
+ #ifdef __CUDA_ARCH__
144
+ *res = surf1Dread<T>(surf, x, mode);
145
+ #endif /* __CUDA_ARCH__ */
146
+ }
147
+
148
+
149
+ template <typename T>
150
+ static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
151
+ {
152
+ #ifdef __CUDA_ARCH__
153
+ __nv_tex_surf_handler("__surf2Dread_v2", (void *)res, s, surf, x, y, mode);
154
+ #endif
155
+ }
156
+
157
+ template<class T>
158
+ static __DEPRECATED__ __device__ __forceinline__ T surf2Dread(surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
159
+ {
160
+ #ifdef __CUDA_ARCH__
161
+ T temp;
162
+ __nv_tex_surf_handler("__surf2Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, mode);
163
+ return temp;
164
+ #endif
165
+ }
166
+
167
+ template<class T>
168
+ static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
169
+ {
170
+ #ifdef __CUDA_ARCH__
171
+ *res = surf2Dread<T>(surf, x, y, mode);
172
+ #endif /* __CUDA_ARCH__ */
173
+ }
174
+
175
+
176
+ template <typename T>
177
+ static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
178
+ {
179
+ #ifdef __CUDA_ARCH__
180
+ __nv_tex_surf_handler("__surf3Dread_v2", (void *)res, s, surf, x, y, z, mode);
181
+ #endif
182
+ }
183
+
184
+ template<class T>
185
+ static __DEPRECATED__ __device__ __forceinline__ T surf3Dread(surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
186
+ {
187
+ #ifdef __CUDA_ARCH__
188
+ T temp;
189
+ __nv_tex_surf_handler("__surf3Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, z, mode);
190
+ return temp;
191
+ #endif
192
+ }
193
+
194
+ template<class T>
195
+ static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
196
+ {
197
+ #ifdef __CUDA_ARCH__
198
+ *res = surf3Dread<T>(surf, x, y, z, mode);
199
+ #endif /* __CUDA_ARCH__ */
200
+ }
201
+
202
+
203
+
204
+ template <typename T>
205
+ static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
206
+ {
207
+ #ifdef __CUDA_ARCH__
208
+ __nv_tex_surf_handler("__surf1DLayeredread_v2", (void *)res, s, surf, x, layer, mode);
209
+ #endif
210
+ }
211
+
212
+ template<class T>
213
+ static __DEPRECATED__ __device__ __forceinline__ T surf1DLayeredread(surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
214
+ {
215
+ #ifdef __CUDA_ARCH__
216
+ T temp;
217
+ __nv_tex_surf_handler("__surf1DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, layer, mode);
218
+ return temp;
219
+ #endif
220
+ }
221
+
222
+
223
+ template<class T>
224
+ static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
225
+ {
226
+ #ifdef __CUDA_ARCH__
227
+ *res = surf1DLayeredread<T>(surf, x, layer, mode);
228
+ #endif /* __CUDA_ARCH__ */
229
+ }
230
+
231
+
232
+ template <typename T>
233
+ static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
234
+ {
235
+ #ifdef __CUDA_ARCH__
236
+ __nv_tex_surf_handler("__surf2DLayeredread_v2", (void *)res, s, surf, x, y, layer, mode);
237
+ #endif
238
+ }
239
+
240
+ template<class T>
241
+ static __DEPRECATED__ __device__ __forceinline__ T surf2DLayeredread(surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
242
+ {
243
+ #ifdef __CUDA_ARCH__
244
+ T temp;
245
+ __nv_tex_surf_handler("__surf2DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layer, mode);
246
+ return temp;
247
+ #endif
248
+ }
249
+
250
+
251
+ template<class T>
252
+ static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
253
+ {
254
+ #ifdef __CUDA_ARCH__
255
+ *res = surf2DLayeredread<T>(surf, x, y, layer, mode);
256
+ #endif /* __CUDA_ARCH__ */
257
+ }
258
+
259
+
260
+ template <typename T>
261
+ static __device__ __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
262
+ {
263
+ #ifdef __CUDA_ARCH__
264
+ __nv_tex_surf_handler("__surfCubemapread_v2", (void *)res, s, surf, x, y, face, mode);
265
+ #endif
266
+ }
267
+
268
+ template<class T>
269
+ static __DEPRECATED__ __device__ __forceinline__ T surfCubemapread(surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
270
+ {
271
+ #ifdef __CUDA_ARCH__
272
+ T temp;
273
+
274
+ __nv_tex_surf_handler("__surfCubemapread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, face, mode);
275
+ return temp;
276
+ #endif
277
+ }
278
+
279
+ template<class T>
280
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
281
+ {
282
+ #ifdef __CUDA_ARCH__
283
+ *res = surfCubemapread<T>(surf, x, y, face, mode);
284
+ #endif /* __CUDA_ARCH__ */
285
+ }
286
+
287
+
288
+ template <typename T>
289
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
290
+ {
291
+ #ifdef __CUDA_ARCH__
292
+ __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (void *)res, s, surf, x, y, layerFace, mode);
293
+ #endif
294
+ }
295
+
296
+ template<class T>
297
+ static __DEPRECATED__ __device__ __forceinline__ T surfCubemapLayeredread(surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
298
+ {
299
+ #ifdef __CUDA_ARCH__
300
+ T temp;
301
+ __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layerFace, mode);
302
+ return temp;
303
+ #endif
304
+ }
305
+
306
+ template<class T>
307
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
308
+ {
309
+ #ifdef __CUDA_ARCH__
310
+ *res = surfCubemapLayeredread<T>(surf, x, y, layerFace, mode);
311
+ #endif /* __CUDA_ARCH__ */
312
+ }
313
+
314
+ //surf1Dwrite
315
+ template<class T>
316
+ static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
317
+ {
318
+ #ifdef __CUDA_ARCH__
319
+ __nv_tex_surf_handler("__surf1Dwrite_v2", (void *)&val, s, surf, x, mode);
320
+ #endif
321
+ }
322
+
323
+ template<class T>
324
+ static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
325
+ {
326
+ #ifdef __CUDA_ARCH__
327
+ __nv_tex_surf_handler("__surf1Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, mode);
328
+ #endif /* __CUDA_ARCH__ */
329
+ }
330
+
331
+
332
+ //surf2Dwrite
333
+ template<class T>
334
+ static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
335
+ {
336
+ #ifdef __CUDA_ARCH__
337
+ __nv_tex_surf_handler("__surf2Dwrite_v2", (void *)&val, s, surf, x, y, mode);
338
+ #endif
339
+ }
340
+
341
+ template<class T>
342
+ static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
343
+ {
344
+ #ifdef __CUDA_ARCH__
345
+ __nv_tex_surf_handler("__surf2Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, mode);
346
+ #endif /* __CUDA_ARCH__ */
347
+ }
348
+
349
+ //surf3Dwrite
350
+ template<class T>
351
+ static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
352
+ {
353
+ #ifdef __CUDA_ARCH__
354
+ __nv_tex_surf_handler("__surf3Dwrite_v2", (void *)&val, s, surf, x, y, z,mode);
355
+ #endif
356
+ }
357
+
358
+ template<class T>
359
+ static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
360
+ {
361
+ #ifdef __CUDA_ARCH__
362
+ __nv_tex_surf_handler("__surf3Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, z, mode);
363
+ #endif /* __CUDA_ARCH__ */
364
+ }
365
+
366
+ //surf1DLayeredwrite
367
+ template<class T>
368
+ static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
369
+ {
370
+ #ifdef __CUDA_ARCH__
371
+ __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (void *)&val, s, surf, x, layer,mode);
372
+ #endif
373
+ }
374
+
375
+ template<class T>
376
+ static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
377
+ {
378
+ #ifdef __CUDA_ARCH__
379
+ __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, layer, mode);
380
+ #endif /* __CUDA_ARCH__ */
381
+ }
382
+
383
+ //surf2DLayeredwrite
384
+ template<class T>
385
+ static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
386
+ {
387
+ #ifdef __CUDA_ARCH__
388
+ __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (void *)&val, s, surf, x, y, layer,mode);
389
+ #endif
390
+ }
391
+
392
+ template<class T>
393
+ static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
394
+ {
395
+ #ifdef __CUDA_ARCH__
396
+ __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, layer, mode);
397
+ #endif /* __CUDA_ARCH__ */
398
+ }
399
+
400
+ //surfCubemapwrite
401
+ template<class T>
402
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
403
+ {
404
+ #ifdef __CUDA_ARCH__
405
+ __nv_tex_surf_handler("__surfCubemapwrite_v2", (void *)&val, s, surf, x, y, face, mode);
406
+ #endif
407
+ }
408
+
409
+ template<class T>
410
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
411
+ {
412
+ #ifdef __CUDA_ARCH__
413
+ __nv_tex_surf_handler("__surfCubemapwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, face, mode);
414
+ #endif /* __CUDA_ARCH__ */
415
+ }
416
+
417
+
418
+ //surfCubemapLayeredwrite
419
+ template<class T>
420
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
421
+ {
422
+ #ifdef __CUDA_ARCH__
423
+ __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (void *)&val, s, surf, x, y, layerFace, mode);
424
+ #endif
425
+ }
426
+
427
+ template<class T>
428
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
429
+ {
430
+ #ifdef __CUDA_ARCH__
431
+ __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, layerFace, mode);
432
+ #endif /* __CUDA_ARCH__ */
433
+ }
434
+
435
+ #undef __DEPRECATED__
436
+
437
+
438
+ #endif /* __cplusplus && __CUDACC__ */
439
+ #endif /* !__SURFACE_FUNCTIONS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__TEXTURE_FETCH_FUNCTIONS_H__)
51
+ #define __TEXTURE_FETCH_FUNCTIONS_H__
52
+
53
+
54
+ #if defined(__cplusplus) && defined(__CUDACC__)
55
+
56
+ /*******************************************************************************
57
+ * *
58
+ * *
59
+ * *
60
+ *******************************************************************************/
61
+
62
+ #include "cuda_runtime_api.h"
63
+ #include "cuda_texture_types.h"
64
+
65
+ #if defined(_WIN32)
66
+ # define __DEPRECATED__ __declspec(deprecated)
67
+ #else
68
+ # define __DEPRECATED__ __attribute__((deprecated))
69
+ #endif
70
+
71
+
72
+ template <typename T>
73
+ struct __nv_tex_rmet_ret { };
74
+
75
+ template<> struct __nv_tex_rmet_ret<char> { typedef char type; };
76
+ template<> struct __nv_tex_rmet_ret<signed char> { typedef signed char type; };
77
+ template<> struct __nv_tex_rmet_ret<unsigned char> { typedef unsigned char type; };
78
+ template<> struct __nv_tex_rmet_ret<char1> { typedef char1 type; };
79
+ template<> struct __nv_tex_rmet_ret<uchar1> { typedef uchar1 type; };
80
+ template<> struct __nv_tex_rmet_ret<char2> { typedef char2 type; };
81
+ template<> struct __nv_tex_rmet_ret<uchar2> { typedef uchar2 type; };
82
+ template<> struct __nv_tex_rmet_ret<char4> { typedef char4 type; };
83
+ template<> struct __nv_tex_rmet_ret<uchar4> { typedef uchar4 type; };
84
+
85
+ template<> struct __nv_tex_rmet_ret<short> { typedef short type; };
86
+ template<> struct __nv_tex_rmet_ret<unsigned short> { typedef unsigned short type; };
87
+ template<> struct __nv_tex_rmet_ret<short1> { typedef short1 type; };
88
+ template<> struct __nv_tex_rmet_ret<ushort1> { typedef ushort1 type; };
89
+ template<> struct __nv_tex_rmet_ret<short2> { typedef short2 type; };
90
+ template<> struct __nv_tex_rmet_ret<ushort2> { typedef ushort2 type; };
91
+ template<> struct __nv_tex_rmet_ret<short4> { typedef short4 type; };
92
+ template<> struct __nv_tex_rmet_ret<ushort4> { typedef ushort4 type; };
93
+
94
+ template<> struct __nv_tex_rmet_ret<int> { typedef int type; };
95
+ template<> struct __nv_tex_rmet_ret<unsigned int> { typedef unsigned int type; };
96
+ template<> struct __nv_tex_rmet_ret<int1> { typedef int1 type; };
97
+ template<> struct __nv_tex_rmet_ret<uint1> { typedef uint1 type; };
98
+ template<> struct __nv_tex_rmet_ret<int2> { typedef int2 type; };
99
+ template<> struct __nv_tex_rmet_ret<uint2> { typedef uint2 type; };
100
+ template<> struct __nv_tex_rmet_ret<int4> { typedef int4 type; };
101
+ template<> struct __nv_tex_rmet_ret<uint4> { typedef uint4 type; };
102
+
103
+ #if !defined(__LP64__)
104
+ template<> struct __nv_tex_rmet_ret<long> { typedef long type; };
105
+ template<> struct __nv_tex_rmet_ret<unsigned long> { typedef unsigned long type; };
106
+ template<> struct __nv_tex_rmet_ret<long1> { typedef long1 type; };
107
+ template<> struct __nv_tex_rmet_ret<ulong1> { typedef ulong1 type; };
108
+ template<> struct __nv_tex_rmet_ret<long2> { typedef long2 type; };
109
+ template<> struct __nv_tex_rmet_ret<ulong2> { typedef ulong2 type; };
110
+ template<> struct __nv_tex_rmet_ret<long4> { typedef long4 type; };
111
+ template<> struct __nv_tex_rmet_ret<ulong4> { typedef ulong4 type; };
112
+ #endif /* !__LP64__ */
113
+ template<> struct __nv_tex_rmet_ret<float> { typedef float type; };
114
+ template<> struct __nv_tex_rmet_ret<float1> { typedef float1 type; };
115
+ template<> struct __nv_tex_rmet_ret<float2> { typedef float2 type; };
116
+ template<> struct __nv_tex_rmet_ret<float4> { typedef float4 type; };
117
+
118
+
119
+ template <typename T> struct __nv_tex_rmet_cast { typedef T* type; };
120
+ #if !defined(__LP64__)
121
+ template<> struct __nv_tex_rmet_cast<long> { typedef int *type; };
122
+ template<> struct __nv_tex_rmet_cast<unsigned long> { typedef unsigned int *type; };
123
+ template<> struct __nv_tex_rmet_cast<long1> { typedef int1 *type; };
124
+ template<> struct __nv_tex_rmet_cast<ulong1> { typedef uint1 *type; };
125
+ template<> struct __nv_tex_rmet_cast<long2> { typedef int2 *type; };
126
+ template<> struct __nv_tex_rmet_cast<ulong2> { typedef uint2 *type; };
127
+ template<> struct __nv_tex_rmet_cast<long4> { typedef int4 *type; };
128
+ template<> struct __nv_tex_rmet_cast<ulong4> { typedef uint4 *type; };
129
+ #endif /* !__LP64__ */
130
+
131
+ template <typename T>
132
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeElementType> t, int x)
133
+ {
134
+ #ifdef __CUDA_ARCH__
135
+ typename __nv_tex_rmet_ret<T>::type temp;
136
+ __nv_tex_surf_handler("__tex1Dfetch_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x);
137
+ return temp;
138
+ #endif
139
+ }
140
+
141
+ template <typename T>
142
+ struct __nv_tex_rmnf_ret { };
143
+
144
+ template <> struct __nv_tex_rmnf_ret<char> { typedef float type; };
145
+ template <> struct __nv_tex_rmnf_ret<signed char> { typedef float type; };
146
+ template <> struct __nv_tex_rmnf_ret<unsigned char> { typedef float type; };
147
+ template <> struct __nv_tex_rmnf_ret<short> { typedef float type; };
148
+ template <> struct __nv_tex_rmnf_ret<unsigned short> { typedef float type; };
149
+ template <> struct __nv_tex_rmnf_ret<char1> { typedef float1 type; };
150
+ template <> struct __nv_tex_rmnf_ret<uchar1> { typedef float1 type; };
151
+ template <> struct __nv_tex_rmnf_ret<short1> { typedef float1 type; };
152
+ template <> struct __nv_tex_rmnf_ret<ushort1> { typedef float1 type; };
153
+ template <> struct __nv_tex_rmnf_ret<char2> { typedef float2 type; };
154
+ template <> struct __nv_tex_rmnf_ret<uchar2> { typedef float2 type; };
155
+ template <> struct __nv_tex_rmnf_ret<short2> { typedef float2 type; };
156
+ template <> struct __nv_tex_rmnf_ret<ushort2> { typedef float2 type; };
157
+ template <> struct __nv_tex_rmnf_ret<char4> { typedef float4 type; };
158
+ template <> struct __nv_tex_rmnf_ret<uchar4> { typedef float4 type; };
159
+ template <> struct __nv_tex_rmnf_ret<short4> { typedef float4 type; };
160
+ template <> struct __nv_tex_rmnf_ret<ushort4> { typedef float4 type; };
161
+
162
+ template <typename T>
163
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, int x)
164
+ {
165
+ #ifdef __CUDA_ARCH__
166
+ T type_dummy;
167
+ typename __nv_tex_rmnf_ret<T>::type retval;
168
+ __nv_tex_surf_handler("__tex1Dfetch_rmnf_v2", &type_dummy, &retval, t, x);
169
+ return retval;
170
+ #endif /* __CUDA_ARCH__ */
171
+ }
172
+
173
+ // tex1D
174
+ template <typename T>
175
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x)
176
+ {
177
+ #ifdef __CUDA_ARCH__
178
+ typename __nv_tex_rmet_ret<T>::type temp;
179
+ __nv_tex_surf_handler("__tex1D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x);
180
+ return temp;
181
+ #endif
182
+ }
183
+
184
+ template <typename T>
185
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x)
186
+ {
187
+ #ifdef __CUDA_ARCH__
188
+ T type_dummy;
189
+ typename __nv_tex_rmnf_ret<T>::type retval;
190
+ __nv_tex_surf_handler("__tex1D_rmnf_v2", &type_dummy, &retval, t, x);
191
+ return retval;
192
+ #endif /* __CUDA_ARCH__ */
193
+ }
194
+
195
+
196
+ //tex2D
197
+ template <typename T>
198
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y)
199
+ {
200
+ #ifdef __CUDA_ARCH__
201
+ typename __nv_tex_rmet_ret<T>::type temp;
202
+
203
+ __nv_tex_surf_handler("__tex2D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y);
204
+ return temp;
205
+ #endif
206
+ }
207
+
208
+ template <typename T>
209
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y)
210
+ {
211
+ #ifdef __CUDA_ARCH__
212
+ T type_dummy;
213
+ typename __nv_tex_rmnf_ret<T>::type retval;
214
+ __nv_tex_surf_handler("__tex2D_rmnf_v2", &type_dummy, &retval, t, x, y);
215
+ return retval;
216
+ #endif /* __CUDA_ARCH__ */
217
+ }
218
+
219
+
220
+ //tex1DLayered
221
+ template <typename T>
222
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer)
223
+ {
224
+ #ifdef __CUDA_ARCH__
225
+ typename __nv_tex_rmet_ret<T>::type temp;
226
+ __nv_tex_surf_handler("__tex1DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, layer);
227
+ return temp;
228
+ #endif
229
+ }
230
+
231
+ template <typename T>
232
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer)
233
+ {
234
+ #ifdef __CUDA_ARCH__
235
+ T type_dummy;
236
+ typename __nv_tex_rmnf_ret<T>::type retval;
237
+ __nv_tex_surf_handler("__tex1DLayered_rmnf_v2", &type_dummy, &retval, t, x, layer);
238
+ return retval;
239
+ #endif /* __CUDA_ARCH__ */
240
+ }
241
+
242
+
243
+ //tex2DLayered
244
+ template <typename T>
245
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer)
246
+ {
247
+ #ifdef __CUDA_ARCH__
248
+ typename __nv_tex_rmet_ret<T>::type temp;
249
+ __nv_tex_surf_handler("__tex2DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, layer);
250
+ return temp;
251
+ #endif
252
+ }
253
+
254
+ template <typename T>
255
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer)
256
+ {
257
+ #ifdef __CUDA_ARCH__
258
+ T type_dummy;
259
+ typename __nv_tex_rmnf_ret<T>::type retval;
260
+ __nv_tex_surf_handler("__tex2DLayered_rmnf_v2", &type_dummy, &retval, t, x, y, layer);
261
+ return retval;
262
+ #endif /* __CUDA_ARCH__ */
263
+ }
264
+
265
+ // tex3D
266
+ template <typename T>
267
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z)
268
+ {
269
+ #ifdef __CUDA_ARCH__
270
+ typename __nv_tex_rmet_ret<T>::type temp;
271
+ __nv_tex_surf_handler("__tex3D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z);
272
+ return temp;
273
+ #endif
274
+ }
275
+
276
+ template <typename T>
277
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z)
278
+ {
279
+ #ifdef __CUDA_ARCH__
280
+ T type_dummy;
281
+ typename __nv_tex_rmnf_ret<T>::type retval;
282
+ __nv_tex_surf_handler("__tex3D_rmnf_v2", &type_dummy, &retval, t, x, y, z);
283
+ return retval;
284
+ #endif /* __CUDA_ARCH__ */
285
+ }
286
+
287
+ // texCubemap
288
+ template <typename T>
289
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z)
290
+ {
291
+ #ifdef __CUDA_ARCH__
292
+ typename __nv_tex_rmet_ret<T>::type temp;
293
+ __nv_tex_surf_handler("__texCubemap_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z);
294
+ return temp;
295
+ #endif
296
+ }
297
+
298
+ template <typename T>
299
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z)
300
+ {
301
+ #ifdef __CUDA_ARCH__
302
+ T type_dummy;
303
+ typename __nv_tex_rmnf_ret<T>::type retval;
304
+ __nv_tex_surf_handler("__texCubemap_rmnf_v2", &type_dummy, &retval, t, x, y, z);
305
+ return retval;
306
+ #endif /* __CUDA_ARCH__ */
307
+ }
308
+
309
+
310
+ template <typename T>
311
+ struct __nv_tex2dgather_ret { };
312
+ template <> struct __nv_tex2dgather_ret<char> { typedef char4 type; };
313
+ template <> struct __nv_tex2dgather_ret<signed char> { typedef char4 type; };
314
+ template <> struct __nv_tex2dgather_ret<char1> { typedef char4 type; };
315
+ template <> struct __nv_tex2dgather_ret<char2> { typedef char4 type; };
316
+ template <> struct __nv_tex2dgather_ret<char3> { typedef char4 type; };
317
+ template <> struct __nv_tex2dgather_ret<char4> { typedef char4 type; };
318
+ template <> struct __nv_tex2dgather_ret<unsigned char> { typedef uchar4 type; };
319
+ template <> struct __nv_tex2dgather_ret<uchar1> { typedef uchar4 type; };
320
+ template <> struct __nv_tex2dgather_ret<uchar2> { typedef uchar4 type; };
321
+ template <> struct __nv_tex2dgather_ret<uchar3> { typedef uchar4 type; };
322
+ template <> struct __nv_tex2dgather_ret<uchar4> { typedef uchar4 type; };
323
+
324
+ template <> struct __nv_tex2dgather_ret<short> { typedef short4 type; };
325
+ template <> struct __nv_tex2dgather_ret<short1> { typedef short4 type; };
326
+ template <> struct __nv_tex2dgather_ret<short2> { typedef short4 type; };
327
+ template <> struct __nv_tex2dgather_ret<short3> { typedef short4 type; };
328
+ template <> struct __nv_tex2dgather_ret<short4> { typedef short4 type; };
329
+ template <> struct __nv_tex2dgather_ret<unsigned short> { typedef ushort4 type; };
330
+ template <> struct __nv_tex2dgather_ret<ushort1> { typedef ushort4 type; };
331
+ template <> struct __nv_tex2dgather_ret<ushort2> { typedef ushort4 type; };
332
+ template <> struct __nv_tex2dgather_ret<ushort3> { typedef ushort4 type; };
333
+ template <> struct __nv_tex2dgather_ret<ushort4> { typedef ushort4 type; };
334
+
335
+ template <> struct __nv_tex2dgather_ret<int> { typedef int4 type; };
336
+ template <> struct __nv_tex2dgather_ret<int1> { typedef int4 type; };
337
+ template <> struct __nv_tex2dgather_ret<int2> { typedef int4 type; };
338
+ template <> struct __nv_tex2dgather_ret<int3> { typedef int4 type; };
339
+ template <> struct __nv_tex2dgather_ret<int4> { typedef int4 type; };
340
+ template <> struct __nv_tex2dgather_ret<unsigned int> { typedef uint4 type; };
341
+ template <> struct __nv_tex2dgather_ret<uint1> { typedef uint4 type; };
342
+ template <> struct __nv_tex2dgather_ret<uint2> { typedef uint4 type; };
343
+ template <> struct __nv_tex2dgather_ret<uint3> { typedef uint4 type; };
344
+ template <> struct __nv_tex2dgather_ret<uint4> { typedef uint4 type; };
345
+
346
+ template <> struct __nv_tex2dgather_ret<float> { typedef float4 type; };
347
+ template <> struct __nv_tex2dgather_ret<float1> { typedef float4 type; };
348
+ template <> struct __nv_tex2dgather_ret<float2> { typedef float4 type; };
349
+ template <> struct __nv_tex2dgather_ret<float3> { typedef float4 type; };
350
+ template <> struct __nv_tex2dgather_ret<float4> { typedef float4 type; };
351
+
352
+ template <typename T>
353
+ static __device__ __forceinline__ typename __nv_tex2dgather_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, int comp=0)
354
+ {
355
+ #ifdef __CUDA_ARCH__
356
+ T type_dummy;
357
+ typename __nv_tex2dgather_ret<T>::type retval;
358
+ __nv_tex_surf_handler("__tex2Dgather_v2", &type_dummy, &retval, t, x, y, comp);
359
+ return retval;
360
+ #endif /* __CUDA_ARCH__ */
361
+ }
362
+
363
+
364
+ template<typename T> struct __nv_tex2dgather_rmnf_ret { };
365
+ template<> struct __nv_tex2dgather_rmnf_ret<char> { typedef float4 type; };
366
+ template<> struct __nv_tex2dgather_rmnf_ret<signed char> { typedef float4 type; };
367
+ template<> struct __nv_tex2dgather_rmnf_ret<unsigned char> { typedef float4 type; };
368
+ template<> struct __nv_tex2dgather_rmnf_ret<char1> { typedef float4 type; };
369
+ template<> struct __nv_tex2dgather_rmnf_ret<uchar1> { typedef float4 type; };
370
+ template<> struct __nv_tex2dgather_rmnf_ret<char2> { typedef float4 type; };
371
+ template<> struct __nv_tex2dgather_rmnf_ret<uchar2> { typedef float4 type; };
372
+ template<> struct __nv_tex2dgather_rmnf_ret<char3> { typedef float4 type; };
373
+ template<> struct __nv_tex2dgather_rmnf_ret<uchar3> { typedef float4 type; };
374
+ template<> struct __nv_tex2dgather_rmnf_ret<char4> { typedef float4 type; };
375
+ template<> struct __nv_tex2dgather_rmnf_ret<uchar4> { typedef float4 type; };
376
+ template<> struct __nv_tex2dgather_rmnf_ret<signed short> { typedef float4 type; };
377
+ template<> struct __nv_tex2dgather_rmnf_ret<unsigned short> { typedef float4 type; };
378
+ template<> struct __nv_tex2dgather_rmnf_ret<short1> { typedef float4 type; };
379
+ template<> struct __nv_tex2dgather_rmnf_ret<ushort1> { typedef float4 type; };
380
+ template<> struct __nv_tex2dgather_rmnf_ret<short2> { typedef float4 type; };
381
+ template<> struct __nv_tex2dgather_rmnf_ret<ushort2> { typedef float4 type; };
382
+ template<> struct __nv_tex2dgather_rmnf_ret<short3> { typedef float4 type; };
383
+ template<> struct __nv_tex2dgather_rmnf_ret<ushort3> { typedef float4 type; };
384
+ template<> struct __nv_tex2dgather_rmnf_ret<short4> { typedef float4 type; };
385
+ template<> struct __nv_tex2dgather_rmnf_ret<ushort4> { typedef float4 type; };
386
+
387
+ template <typename T>
388
+ static __device__ __forceinline__ typename __nv_tex2dgather_rmnf_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, int comp = 0)
389
+ {
390
+ #ifdef __CUDA_ARCH__
391
+ T type_dummy;
392
+ typename __nv_tex2dgather_rmnf_ret<T>::type retval;
393
+ __nv_tex_surf_handler("__tex2Dgather_rmnf_v2", &type_dummy, &retval, t, x, y, comp);
394
+ return retval;
395
+ #endif /* __CUDA_ARCH__ */
396
+ }
397
+
398
+
399
+ // tex1DLod
400
+ template <typename T>
401
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float level)
402
+ {
403
+ #ifdef __CUDA_ARCH__
404
+ typename __nv_tex_rmet_ret<T>::type temp;
405
+ __nv_tex_surf_handler("__tex1DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, level);
406
+ return temp;
407
+ #endif
408
+ }
409
+
410
+ template <typename T>
411
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float level)
412
+ {
413
+ #ifdef __CUDA_ARCH__
414
+ T type_dummy;
415
+ typename __nv_tex_rmnf_ret<T>::type retval;
416
+ __nv_tex_surf_handler("__tex1DLod_rmnf_v2", &type_dummy, &retval, t, x, level);
417
+ return retval;
418
+ #endif /* __CUDA_ARCH__ */
419
+ }
420
+
421
+ // tex2DLod
422
+ template <typename T>
423
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float level)
424
+ {
425
+ #ifdef __CUDA_ARCH__
426
+ typename __nv_tex_rmet_ret<T>::type temp;
427
+ __nv_tex_surf_handler("__tex2DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, level);
428
+ return temp;
429
+ #endif
430
+ }
431
+
432
+ template <typename T>
433
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float level)
434
+ {
435
+ #ifdef __CUDA_ARCH__
436
+ T type_dummy;
437
+ typename __nv_tex_rmnf_ret<T>::type retval;
438
+ __nv_tex_surf_handler("__tex2DLod_rmnf_v2", &type_dummy, &retval, t, x, y, level);
439
+ return retval;
440
+ #endif /* __CUDA_ARCH__ */
441
+ }
442
+
443
+ // tex1DLayeredLod
444
+ template <typename T>
445
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float level)
446
+ {
447
+ #ifdef __CUDA_ARCH__
448
+ typename __nv_tex_rmet_ret<T>::type temp;
449
+ __nv_tex_surf_handler("__tex1DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, level);
450
+ return temp;
451
+ #endif
452
+ }
453
+
454
+ template <typename T>
455
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float level)
456
+ {
457
+ #ifdef __CUDA_ARCH__
458
+ T type_dummy;
459
+ typename __nv_tex_rmnf_ret<T>::type retval;
460
+ __nv_tex_surf_handler("__tex1DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, layer, level);
461
+ return retval;
462
+ #endif /* __CUDA_ARCH__ */
463
+ }
464
+
465
+ // tex2DLayeredLod
466
+ template <typename T>
467
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float level)
468
+ {
469
+ #ifdef __CUDA_ARCH__
470
+ typename __nv_tex_rmet_ret<T>::type temp;
471
+ __nv_tex_surf_handler("__tex2DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, level);
472
+ return temp;
473
+ #endif
474
+ }
475
+
476
+ template <typename T>
477
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float level)
478
+ {
479
+ #ifdef __CUDA_ARCH__
480
+ T type_dummy;
481
+ typename __nv_tex_rmnf_ret<T>::type retval;
482
+ __nv_tex_surf_handler("__tex2DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, layer, level);
483
+ return retval;
484
+ #endif /* __CUDA_ARCH__ */
485
+ }
486
+
487
+ // tex3DLod
488
+ template <typename T>
489
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float level)
490
+ {
491
+ #ifdef __CUDA_ARCH__
492
+ typename __nv_tex_rmet_ret<T>::type temp;
493
+ __nv_tex_surf_handler("__tex3DLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level);
494
+ return temp;
495
+ #endif
496
+ }
497
+
498
+ template <typename T>
499
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level)
500
+ {
501
+ #ifdef __CUDA_ARCH__
502
+ T type_dummy;
503
+ typename __nv_tex_rmnf_ret<T>::type retval;
504
+ __nv_tex_surf_handler("__tex3DLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level);
505
+ return retval;
506
+ #endif /* __CUDA_ARCH__ */
507
+ }
508
+
509
+ // texCubemapLod
510
+ template <typename T>
511
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float level)
512
+ {
513
+ #ifdef __CUDA_ARCH__
514
+ typename __nv_tex_rmet_ret<T>::type temp;
515
+ __nv_tex_surf_handler("__texCubemapLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level);
516
+ return temp;
517
+ #endif
518
+ }
519
+
520
+ template <typename T>
521
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level)
522
+ {
523
+ #ifdef __CUDA_ARCH__
524
+ T type_dummy;
525
+ typename __nv_tex_rmnf_ret<T>::type retval;
526
+ __nv_tex_surf_handler("__texCubemapLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level);
527
+ return retval;
528
+ #endif /* __CUDA_ARCH__ */
529
+ }
530
+
531
+
532
+ // texCubemapLayered
533
+ template <typename T>
534
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer)
535
+ {
536
+ #ifdef __CUDA_ARCH__
537
+ typename __nv_tex_rmet_ret<T>::type temp;
538
+ __nv_tex_surf_handler("__texCubemapLayered_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer);
539
+ return temp;
540
+ #endif
541
+ }
542
+
543
+ template <typename T>
544
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer)
545
+ {
546
+ #ifdef __CUDA_ARCH__
547
+ T type_dummy;
548
+ typename __nv_tex_rmnf_ret<T>::type retval;
549
+ __nv_tex_surf_handler("__texCubemapLayered_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer);
550
+ return retval;
551
+ #endif /* __CUDA_ARCH__ */
552
+ }
553
+
554
+
555
+ // texCubemapLayeredLod
556
+ template <typename T>
557
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float level)
558
+ {
559
+ #ifdef __CUDA_ARCH__
560
+ typename __nv_tex_rmet_ret<T>::type temp;
561
+ __nv_tex_surf_handler("__texCubemapLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, level);
562
+ return temp;
563
+ #endif
564
+ }
565
+
566
+ template <typename T>
567
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float level)
568
+ {
569
+ #ifdef __CUDA_ARCH__
570
+ T type_dummy;
571
+ typename __nv_tex_rmnf_ret<T>::type retval;
572
+ __nv_tex_surf_handler("__texCubemapLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer, level);
573
+ return retval;
574
+ #endif /* __CUDA_ARCH__ */
575
+ }
576
+
577
+
578
+ // texCubemapGrad
579
+ template <typename T>
580
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy)
581
+ {
582
+ #ifdef __CUDA_ARCH__
583
+ typename __nv_tex_rmet_ret<T>::type temp;
584
+ __nv_tex_surf_handler("__texCubemapGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy);
585
+ return temp;
586
+ #endif
587
+ }
588
+
589
+ template <typename T>
590
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy)
591
+ {
592
+ #ifdef __CUDA_ARCH__
593
+ T type_dummy;
594
+ typename __nv_tex_rmnf_ret<T>::type retval;
595
+ __nv_tex_surf_handler("__texCubemapGrad_rmnf_v2", &type_dummy, &retval, t, x, y, z, &dPdx, &dPdy);
596
+ return retval;
597
+ #endif /* __CUDA_ARCH__ */
598
+ }
599
+
600
+
601
+ // texCubemapLayeredGrad
602
+ template <typename T>
603
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
604
+ {
605
+ #ifdef __CUDA_ARCH__
606
+ typename __nv_tex_rmet_ret<T>::type temp;
607
+ __nv_tex_surf_handler("__texCubemapLayeredGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, &dPdx, &dPdy);
608
+ return temp;
609
+ #endif
610
+ }
611
+
612
+ template <typename T>
613
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
614
+ {
615
+ #ifdef __CUDA_ARCH__
616
+ T type_dummy;
617
+ typename __nv_tex_rmnf_ret<T>::type retval;
618
+ __nv_tex_surf_handler("__texCubemapLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, layer, &dPdx, &dPdy);
619
+ return retval;
620
+ #endif /* __CUDA_ARCH__ */
621
+ }
622
+
623
+
624
+ // tex1DGrad
625
+ template <typename T>
626
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float dPdx, float dPdy)
627
+ {
628
+ #ifdef __CUDA_ARCH__
629
+ typename __nv_tex_rmet_ret<T>::type temp;
630
+ __nv_tex_surf_handler("__tex1DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, dPdx, dPdy);
631
+ return temp;
632
+ #endif
633
+ }
634
+
635
+ template <typename T>
636
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float dPdx, float dPdy)
637
+ {
638
+ #ifdef __CUDA_ARCH__
639
+ T type_dummy;
640
+ typename __nv_tex_rmnf_ret<T>::type retval;
641
+ __nv_tex_surf_handler("__tex1DGrad_rmnf_v2", &type_dummy, &retval,t, x,dPdx, dPdy);
642
+ return retval;
643
+ #endif /* __CUDA_ARCH__ */
644
+ }
645
+
646
+
647
+ // tex2DGrad
648
+ template <typename T>
649
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float2 dPdx, float2 dPdy)
650
+ {
651
+ #ifdef __CUDA_ARCH__
652
+ typename __nv_tex_rmet_ret<T>::type temp;
653
+ __nv_tex_surf_handler("__tex2DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, &dPdx, &dPdy);
654
+ return temp;
655
+ #endif
656
+ }
657
+
658
+ template <typename T>
659
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float2 dPdx, float2 dPdy)
660
+ {
661
+ #ifdef __CUDA_ARCH__
662
+ T type_dummy;
663
+ typename __nv_tex_rmnf_ret<T>::type retval;
664
+ __nv_tex_surf_handler("__tex2DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, &dPdx, &dPdy);
665
+ return retval;
666
+ #endif /* __CUDA_ARCH__ */
667
+ }
668
+
669
+ // tex1DLayeredGrad
670
+ template <typename T>
671
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float dPdx, float dPdy)
672
+ {
673
+ #ifdef __CUDA_ARCH__
674
+ typename __nv_tex_rmet_ret<T>::type temp;
675
+ __nv_tex_surf_handler("__tex1DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, dPdx, dPdy);
676
+ return temp;
677
+ #endif
678
+ }
679
+
680
+ template <typename T>
681
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float dPdx, float dPdy)
682
+ {
683
+ #ifdef __CUDA_ARCH__
684
+ T type_dummy;
685
+ typename __nv_tex_rmnf_ret<T>::type retval;
686
+ __nv_tex_surf_handler("__tex1DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, layer, dPdx, dPdy);
687
+ return retval;
688
+ #endif /* __CUDA_ARCH__ */
689
+ }
690
+
691
+ // tex2DLayeredGrad
692
+ template <typename T>
693
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
694
+ {
695
+ #ifdef __CUDA_ARCH__
696
+ typename __nv_tex_rmet_ret<T>::type temp;
697
+ __nv_tex_surf_handler("__tex2DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, &dPdx, &dPdy);
698
+ return temp;
699
+ #endif
700
+ }
701
+
702
+ template <typename T>
703
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
704
+ {
705
+ #ifdef __CUDA_ARCH__
706
+ T type_dummy;
707
+ typename __nv_tex_rmnf_ret<T>::type retval;
708
+ __nv_tex_surf_handler("__tex2DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, layer, &dPdx, &dPdy);
709
+ return retval;
710
+ #endif /* __CUDA_ARCH__ */
711
+ }
712
+
713
+ // tex3DGrad
714
+ template <typename T>
715
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy)
716
+ {
717
+ #ifdef __CUDA_ARCH__
718
+ typename __nv_tex_rmet_ret<T>::type temp;
719
+ __nv_tex_surf_handler("__tex3DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy);
720
+ return temp;
721
+ #endif
722
+ }
723
+
724
+ template <typename T>
725
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy)
726
+ {
727
+ #ifdef __CUDA_ARCH__
728
+ T type_dummy;
729
+ typename __nv_tex_rmnf_ret<T>::type retval;
730
+ __nv_tex_surf_handler("__tex3DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, &dPdx, &dPdy);
731
+ return retval;
732
+ #endif /* __CUDA_ARCH__ */
733
+ }
734
+
735
+ #undef __DEPRECATED__
736
+
737
+ #endif /* __cplusplus && __CUDACC__ */
738
+
739
+ #endif /* !__TEXTURE_FETCH_FUNCTIONS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (224 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn : Neural Networks Library
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_H_)
55
+ #define CUDNN_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops_infer.h"
62
+ #include "cudnn_ops_train.h"
63
+ #include "cudnn_adv_infer.h"
64
+ #include "cudnn_adv_train.h"
65
+ #include "cudnn_cnn_infer.h"
66
+ #include "cudnn_cnn_train.h"
67
+
68
+ #include "cudnn_backend.h"
69
+
70
+ #if defined(__cplusplus)
71
+ extern "C" {
72
+ #endif
73
+
74
+ #if defined(__cplusplus)
75
+ }
76
+ #endif
77
+
78
+ #endif /* CUDNN_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn_adv_infer : cuDNN's advanced and experimental features.
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_ADV_INFER_H_)
55
+ #define CUDNN_ADV_INFER_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops_infer.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_ADV_INFER_MAJOR 8
65
+ #define CUDNN_ADV_INFER_MINOR 7
66
+ #define CUDNN_ADV_INFER_PATCH 0
67
+
68
+ #if (CUDNN_ADV_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_INFER_MINOR != CUDNN_MINOR) || \
69
+ (CUDNN_ADV_INFER_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN ADV INFER!!!
71
+ #endif
72
+
73
+ #if defined(__cplusplus)
74
+ extern "C" {
75
+ #endif
76
+
77
+ /* BASIC RNN API */
78
+
79
+ typedef enum {
80
+ CUDNN_FWD_MODE_INFERENCE = 0,
81
+ CUDNN_FWD_MODE_TRAINING = 1,
82
+ } cudnnForwardMode_t;
83
+
84
+ typedef enum {
85
+ CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */
86
+ CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
87
+ CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */
88
+ CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
89
+ } cudnnRNNMode_t;
90
+
91
+ typedef enum {
92
+ CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */
93
+ CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
94
+ CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */
95
+ CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */
96
+ } cudnnRNNBiasMode_t;
97
+
98
+ typedef enum {
99
+ CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
100
+ CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */
101
+ } cudnnDirectionMode_t;
102
+
103
+ typedef enum {
104
+ CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
105
+ CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */
106
+ } cudnnRNNInputMode_t;
107
+
108
+ typedef enum {
109
+ CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */
110
+ CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
111
+ } cudnnRNNClipMode_t;
112
+
113
+ typedef enum {
114
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */
115
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */
116
+ CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
117
+ } cudnnRNNDataLayout_t;
118
+
119
+ /* Legacy type for backward compatibility */
120
+ typedef unsigned cudnnRNNPaddingMode_t;
121
+
122
+ /* For auxFlags in cudnnSetRNNDescriptor_v8() and cudnnSetRNNPaddingMode() */
123
+ #define CUDNN_RNN_PADDED_IO_DISABLED 0
124
+ #define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
125
+
126
+ struct cudnnRNNStruct;
127
+ typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
128
+
129
+ struct cudnnPersistentRNNPlan;
130
+ typedef struct cudnnPersistentRNNPlan *cudnnPersistentRNNPlan_t;
131
+
132
+ struct cudnnRNNDataStruct;
133
+ typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
134
+
135
+ cudnnStatus_t CUDNNWINAPI
136
+ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
140
+
141
+ cudnnStatus_t CUDNNWINAPI
142
+ cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
143
+ cudnnRNNAlgo_t algo,
144
+ cudnnRNNMode_t cellMode,
145
+ cudnnRNNBiasMode_t biasMode,
146
+ cudnnDirectionMode_t dirMode,
147
+ cudnnRNNInputMode_t inputMode,
148
+ cudnnDataType_t dataType,
149
+ cudnnDataType_t mathPrec,
150
+ cudnnMathType_t mathType,
151
+ int32_t inputSize,
152
+ int32_t hiddenSize,
153
+ int32_t projSize,
154
+ int32_t numLayers,
155
+ cudnnDropoutDescriptor_t dropoutDesc,
156
+ uint32_t auxFlags);
157
+
158
+ cudnnStatus_t CUDNNWINAPI
159
+ cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
160
+ cudnnRNNAlgo_t *algo,
161
+ cudnnRNNMode_t *cellMode,
162
+ cudnnRNNBiasMode_t *biasMode,
163
+ cudnnDirectionMode_t *dirMode,
164
+ cudnnRNNInputMode_t *inputMode,
165
+ cudnnDataType_t *dataType,
166
+ cudnnDataType_t *mathPrec,
167
+ cudnnMathType_t *mathType,
168
+ int32_t *inputSize,
169
+ int32_t *hiddenSize,
170
+ int32_t *projSize,
171
+ int32_t *numLayers,
172
+ cudnnDropoutDescriptor_t *dropoutDesc,
173
+ uint32_t *auxFlags);
174
+
175
+ /*
176
+ * mathPrec in cudnnSetRNNDescriptor_v6() specifies compute precision
177
+ * compute precision is further modified by cudnnSetRNNMatrixMathType()
178
+ * dataType in cudnnGetRNNParamsSize() and wDesc specify weight storage
179
+ * dropout is between RNN layers, not between recurrent steps
180
+ */
181
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
182
+ cudnnSetRNNDescriptor_v6(cudnnHandle_t handle,
183
+ cudnnRNNDescriptor_t rnnDesc,
184
+ const int hiddenSize,
185
+ const int numLayers,
186
+ cudnnDropoutDescriptor_t dropoutDesc,
187
+ cudnnRNNInputMode_t inputMode,
188
+ cudnnDirectionMode_t direction,
189
+ cudnnRNNMode_t cellMode,
190
+ cudnnRNNAlgo_t algo,
191
+ cudnnDataType_t mathPrec);
192
+
193
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
194
+ cudnnGetRNNDescriptor_v6(cudnnHandle_t handle,
195
+ cudnnRNNDescriptor_t rnnDesc,
196
+ int *hiddenSize,
197
+ int *numLayers,
198
+ cudnnDropoutDescriptor_t *dropoutDesc,
199
+ cudnnRNNInputMode_t *inputMode,
200
+ cudnnDirectionMode_t *direction,
201
+ cudnnRNNMode_t *cellMode,
202
+ cudnnRNNAlgo_t *algo,
203
+ cudnnDataType_t *mathPrec);
204
+
205
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
206
+ cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType);
207
+
208
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
209
+ cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType);
210
+
211
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
212
+ cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t biasMode);
213
+
214
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
215
+ cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t *biasMode);
216
+
217
+ cudnnStatus_t CUDNNWINAPI
218
+ cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
219
+ cudnnRNNClipMode_t clipMode,
220
+ cudnnNanPropagation_t clipNanOpt,
221
+ double lclip,
222
+ double rclip);
223
+
224
+ cudnnStatus_t CUDNNWINAPI
225
+ cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
226
+ cudnnRNNClipMode_t *clipMode,
227
+ cudnnNanPropagation_t *clipNanOpt,
228
+ double *lclip,
229
+ double *rclip);
230
+
231
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
232
+ cudnnRNNSetClip(cudnnHandle_t handle,
233
+ cudnnRNNDescriptor_t rnnDesc,
234
+ cudnnRNNClipMode_t clipMode,
235
+ cudnnNanPropagation_t clipNanOpt,
236
+ double lclip,
237
+ double rclip);
238
+
239
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
240
+ cudnnRNNGetClip(cudnnHandle_t handle,
241
+ cudnnRNNDescriptor_t rnnDesc,
242
+ cudnnRNNClipMode_t *clipMode,
243
+ cudnnNanPropagation_t *clipNanOpt,
244
+ double *lclip,
245
+ double *rclip);
246
+
247
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
248
+ cudnnSetRNNProjectionLayers(cudnnHandle_t handle,
249
+ cudnnRNNDescriptor_t rnnDesc,
250
+ const int recProjSize,
251
+ const int outProjSize);
252
+
253
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
254
+ cudnnGetRNNProjectionLayers(cudnnHandle_t handle,
255
+ const cudnnRNNDescriptor_t rnnDesc,
256
+ int *recProjSize,
257
+ int *outProjSize);
258
+
259
+ /* Expensive. Creates the plan for the specific settings. */
260
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
261
+ cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
262
+ const int minibatch,
263
+ const cudnnDataType_t dataType,
264
+ cudnnPersistentRNNPlan_t *plan);
265
+
266
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
267
+ cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan);
268
+
269
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
270
+ cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan);
271
+
272
+ cudnnStatus_t CUDNNWINAPI
273
+ cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
274
+
275
+ /* dataType in weight descriptors and input descriptors is used to describe storage */
276
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
277
+ cudnnGetRNNWorkspaceSize(cudnnHandle_t handle,
278
+ const cudnnRNNDescriptor_t rnnDesc,
279
+ const int seqLength,
280
+ const cudnnTensorDescriptor_t *xDesc,
281
+ size_t *sizeInBytes);
282
+
283
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
284
+ cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle,
285
+ const cudnnRNNDescriptor_t rnnDesc,
286
+ const int seqLength,
287
+ const cudnnTensorDescriptor_t *xDesc,
288
+ size_t *sizeInBytes);
289
+
290
+ cudnnStatus_t CUDNNWINAPI
291
+ cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
292
+ cudnnRNNDescriptor_t rnnDesc,
293
+ cudnnForwardMode_t fMode,
294
+ cudnnRNNDataDescriptor_t xDesc,
295
+ size_t *workSpaceSize,
296
+ size_t *reserveSpaceSize);
297
+
298
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
299
+ cudnnGetRNNParamsSize(cudnnHandle_t handle,
300
+ const cudnnRNNDescriptor_t rnnDesc,
301
+ const cudnnTensorDescriptor_t xDesc,
302
+ size_t *sizeInBytes,
303
+ cudnnDataType_t dataType);
304
+
305
+ cudnnStatus_t CUDNNWINAPI
306
+ cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
307
+
308
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
309
+ cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle,
310
+ const cudnnRNNDescriptor_t rnnDesc,
311
+ const int pseudoLayer,
312
+ const cudnnTensorDescriptor_t xDesc,
313
+ const cudnnFilterDescriptor_t wDesc,
314
+ const void *w,
315
+ const int linLayerID,
316
+ cudnnFilterDescriptor_t linLayerMatDesc,
317
+ void **linLayerMat);
318
+
319
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
320
+ cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle,
321
+ const cudnnRNNDescriptor_t rnnDesc,
322
+ const int pseudoLayer,
323
+ const cudnnTensorDescriptor_t xDesc,
324
+ const cudnnFilterDescriptor_t wDesc,
325
+ const void *w,
326
+ const int linLayerID,
327
+ cudnnFilterDescriptor_t linLayerBiasDesc,
328
+ void **linLayerBias);
329
+
330
+ cudnnStatus_t CUDNNWINAPI
331
+ cudnnGetRNNWeightParams(cudnnHandle_t handle,
332
+ cudnnRNNDescriptor_t rnnDesc,
333
+ int32_t pseudoLayer,
334
+ size_t weightSpaceSize,
335
+ const void *weightSpace,
336
+ int32_t linLayerID,
337
+ cudnnTensorDescriptor_t mDesc,
338
+ void **mAddr,
339
+ cudnnTensorDescriptor_t bDesc,
340
+ void **bAddr);
341
+
342
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
343
+ cudnnRNNForwardInference(cudnnHandle_t handle,
344
+ const cudnnRNNDescriptor_t rnnDesc,
345
+ const int seqLength,
346
+ const cudnnTensorDescriptor_t *xDesc,
347
+ const void *x,
348
+ const cudnnTensorDescriptor_t hxDesc,
349
+ const void *hx,
350
+ const cudnnTensorDescriptor_t cxDesc,
351
+ const void *cx,
352
+ const cudnnFilterDescriptor_t wDesc,
353
+ const void *w,
354
+ const cudnnTensorDescriptor_t *yDesc,
355
+ void *y,
356
+ const cudnnTensorDescriptor_t hyDesc,
357
+ void *hy,
358
+ const cudnnTensorDescriptor_t cyDesc,
359
+ void *cy,
360
+ void *workSpace,
361
+ size_t workSpaceSizeInBytes);
362
+
363
+ /* RNN EX API */
364
+
365
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
366
+ cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned paddingMode);
367
+
368
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
369
+ cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned *paddingMode);
370
+
371
+ cudnnStatus_t CUDNNWINAPI
372
+ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
373
+
374
+ cudnnStatus_t CUDNNWINAPI
375
+ cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
376
+
377
+ cudnnStatus_t CUDNNWINAPI
378
+ cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
379
+ cudnnDataType_t dataType,
380
+ cudnnRNNDataLayout_t layout,
381
+ int maxSeqLength,
382
+ int batchSize,
383
+ int vectorSize,
384
+ const int seqLengthArray[], /* length of each sequence in the batch */
385
+ void *paddingFill); /* symbol for filling padding position in output */
386
+
387
+ cudnnStatus_t CUDNNWINAPI
388
+ cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
389
+ cudnnDataType_t *dataType,
390
+ cudnnRNNDataLayout_t *layout,
391
+ int *maxSeqLength,
392
+ int *batchSize,
393
+ int *vectorSize,
394
+ int arrayLengthRequested,
395
+ int seqLengthArray[],
396
+ void *paddingFill);
397
+
398
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
399
+ cudnnRNNForwardInferenceEx(cudnnHandle_t handle,
400
+ const cudnnRNNDescriptor_t rnnDesc,
401
+ const cudnnRNNDataDescriptor_t xDesc,
402
+ const void *x,
403
+ const cudnnTensorDescriptor_t hxDesc,
404
+ const void *hx,
405
+ const cudnnTensorDescriptor_t cxDesc,
406
+ const void *cx,
407
+ const cudnnFilterDescriptor_t wDesc,
408
+ const void *w,
409
+ const cudnnRNNDataDescriptor_t yDesc,
410
+ void *y,
411
+ const cudnnTensorDescriptor_t hyDesc,
412
+ void *hy,
413
+ const cudnnTensorDescriptor_t cyDesc,
414
+ void *cy,
415
+ const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
416
+ const void *keys, /* reserved, should pass NULL */
417
+ const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
418
+ void *cAttn, /* reserved, should pass NULL */
419
+ const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
420
+ void *iAttn, /* reserved, should pass NULL */
421
+ const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
422
+ void *queries, /* reserved, should pass NULL */
423
+ void *workSpace,
424
+ size_t workSpaceSizeInBytes);
425
+
426
+ cudnnStatus_t CUDNNWINAPI
427
+ cudnnRNNForward(cudnnHandle_t handle,
428
+ cudnnRNNDescriptor_t rnnDesc,
429
+ cudnnForwardMode_t fwdMode,
430
+ const int32_t devSeqLengths[],
431
+ cudnnRNNDataDescriptor_t xDesc,
432
+ const void *x,
433
+ cudnnRNNDataDescriptor_t yDesc,
434
+ void *y,
435
+ cudnnTensorDescriptor_t hDesc,
436
+ const void *hx,
437
+ void *hy,
438
+ cudnnTensorDescriptor_t cDesc,
439
+ const void *cx,
440
+ void *cy,
441
+ size_t weightSpaceSize,
442
+ const void *weightSpace,
443
+ size_t workSpaceSize,
444
+ void *workSpace,
445
+ size_t reserveSpaceSize,
446
+ void *reserveSpace);
447
+
448
+ /* RNN FIND API */
449
+
450
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
451
+ cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc);
452
+
453
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
454
+ cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
455
+
456
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
457
+ cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle,
458
+ const cudnnRNNDescriptor_t rnnDesc,
459
+ const int seqLength,
460
+ const cudnnTensorDescriptor_t *xDesc,
461
+ const void *x,
462
+ const cudnnTensorDescriptor_t hxDesc,
463
+ const void *hx,
464
+ const cudnnTensorDescriptor_t cxDesc,
465
+ const void *cx,
466
+ const cudnnFilterDescriptor_t wDesc,
467
+ const void *w,
468
+ const cudnnTensorDescriptor_t *yDesc,
469
+ void *y,
470
+ const cudnnTensorDescriptor_t hyDesc,
471
+ void *hy,
472
+ const cudnnTensorDescriptor_t cyDesc,
473
+ void *cy,
474
+ const float findIntensity,
475
+ const int requestedAlgoCount,
476
+ int *returnedAlgoCount,
477
+ cudnnAlgorithmPerformance_t *perfResults,
478
+ void *workspace,
479
+ size_t workSpaceSizeInBytes);
480
+
481
+ /* Sequence data descriptor */
482
+
483
+ typedef enum {
484
+ CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */
485
+ CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
486
+ CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */
487
+ CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */
488
+ } cudnnSeqDataAxis_t;
489
+
490
+ struct cudnnSeqDataStruct;
491
+ typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t;
492
+
493
+ #define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
494
+
495
+ cudnnStatus_t CUDNNWINAPI
496
+ cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
497
+
498
+ cudnnStatus_t CUDNNWINAPI
499
+ cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
500
+
501
+ cudnnStatus_t CUDNNWINAPI
502
+ cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
503
+ cudnnDataType_t dataType,
504
+ int nbDims,
505
+ const int dimA[],
506
+ const cudnnSeqDataAxis_t axes[],
507
+ size_t seqLengthArraySize,
508
+ const int seqLengthArray[],
509
+ void *paddingFill);
510
+
511
+ cudnnStatus_t CUDNNWINAPI
512
+ cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
513
+ cudnnDataType_t *dataType,
514
+ int *nbDims,
515
+ int nbDimsRequested,
516
+ int dimA[],
517
+ cudnnSeqDataAxis_t axes[],
518
+ size_t *seqLengthArraySize,
519
+ size_t seqLengthSizeRequested,
520
+ int seqLengthArray[],
521
+ void *paddingFill);
522
+
523
+ /* Multihead Attention */
524
+
525
+ /* Legacy type for backward compatibility */
526
+ typedef unsigned cudnnAttnQueryMap_t;
527
+
528
+ /*
529
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
530
+ * Use the bitwise OR operator to combine several settings listed below. Additional
531
+ * minor options can be added here w/o changing or introducing new API functions.
532
+ */
533
+ #define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */
534
+ #define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
535
+ #define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */
536
+ #define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */
537
+
538
+ struct cudnnAttnStruct;
539
+ typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t;
540
+
541
+ cudnnStatus_t CUDNNWINAPI
542
+ cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
543
+
544
+ cudnnStatus_t CUDNNWINAPI
545
+ cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
546
+
547
+ cudnnStatus_t CUDNNWINAPI
548
+ cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
549
+ unsigned attnMode,
550
+ int nHeads,
551
+ double smScaler,
552
+ cudnnDataType_t dataType,
553
+ cudnnDataType_t computePrec,
554
+ cudnnMathType_t mathType,
555
+ cudnnDropoutDescriptor_t attnDropoutDesc,
556
+ cudnnDropoutDescriptor_t postDropoutDesc,
557
+ int qSize,
558
+ int kSize,
559
+ int vSize,
560
+ int qProjSize,
561
+ int kProjSize,
562
+ int vProjSize,
563
+ int oProjSize,
564
+ int qoMaxSeqLength,
565
+ int kvMaxSeqLength,
566
+ int maxBatchSize,
567
+ int maxBeamSize);
568
+
569
+ cudnnStatus_t CUDNNWINAPI
570
+ cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
571
+ unsigned *attnMode,
572
+ int *nHeads,
573
+ double *smScaler,
574
+ cudnnDataType_t *dataType,
575
+ cudnnDataType_t *computePrec,
576
+ cudnnMathType_t *mathType,
577
+ cudnnDropoutDescriptor_t *attnDropoutDesc,
578
+ cudnnDropoutDescriptor_t *postDropoutDesc,
579
+ int *qSize,
580
+ int *kSize,
581
+ int *vSize,
582
+ int *qProjSize,
583
+ int *kProjSize,
584
+ int *vProjSize,
585
+ int *oProjSize,
586
+ int *qoMaxSeqLength,
587
+ int *kvMaxSeqLength,
588
+ int *maxBatchSize,
589
+ int *maxBeamSize);
590
+
591
+ cudnnStatus_t CUDNNWINAPI
592
+ cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
593
+ const cudnnAttnDescriptor_t attnDesc,
594
+ size_t *weightSizeInBytes,
595
+ size_t *workSpaceSizeInBytes,
596
+ size_t *reserveSpaceSizeInBytes);
597
+
598
+ typedef enum {
599
+ CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
600
+ CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
601
+ CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
602
+ CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
603
+ CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */
604
+ CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */
605
+ CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */
606
+ CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */
607
+ } cudnnMultiHeadAttnWeightKind_t;
608
+
609
+ #define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
610
+
611
+ cudnnStatus_t CUDNNWINAPI
612
+ cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
613
+ const cudnnAttnDescriptor_t attnDesc,
614
+ cudnnMultiHeadAttnWeightKind_t wKind,
615
+ size_t weightSizeInBytes,
616
+ const void *weights,
617
+ cudnnTensorDescriptor_t wDesc,
618
+ void **wAddr);
619
+
620
+ cudnnStatus_t CUDNNWINAPI
621
+ cudnnMultiHeadAttnForward(cudnnHandle_t handle,
622
+ const cudnnAttnDescriptor_t attnDesc,
623
+ int currIdx,
624
+ const int loWinIdx[],
625
+ const int hiWinIdx[],
626
+ const int devSeqLengthsQO[],
627
+ const int devSeqLengthsKV[],
628
+ const cudnnSeqDataDescriptor_t qDesc,
629
+ const void *queries,
630
+ const void *residuals,
631
+ const cudnnSeqDataDescriptor_t kDesc,
632
+ const void *keys,
633
+ const cudnnSeqDataDescriptor_t vDesc,
634
+ const void *values,
635
+ const cudnnSeqDataDescriptor_t oDesc,
636
+ void *out,
637
+ size_t weightSizeInBytes,
638
+ const void *weights,
639
+ size_t workSpaceSizeInBytes,
640
+ void *workSpace,
641
+ size_t reserveSpaceSizeInBytes,
642
+ void *reserveSpace);
643
+
644
+ /*
645
+ * \brief Cross-library version checker.
646
+ * This function is implemented differently in each sub-library. Each sublib
647
+ * checks whether its own version matches that of its dependencies.
648
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
649
+ * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
650
+ */
651
+ cudnnStatus_t CUDNNWINAPI
652
+ cudnnAdvInferVersionCheck(void);
653
+
654
+ #if defined(__cplusplus)
655
+ }
656
+ #endif
657
+
658
+ #endif /* CUDNN_ADV_INFER_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (220 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (220 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*************************************************************************
2
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
3
+ *
4
+ * See LICENSE.txt for license information
5
+ ************************************************************************/
6
+
7
+ #ifndef NCCL_H_
8
+ #define NCCL_H_
9
+
10
+ #include <cuda_runtime.h>
11
+ #include <cuda_fp16.h>
12
+ #if CUDART_VERSION >= 11000
13
+ #include <cuda_bf16.h>
14
+ #endif
15
+
16
+ #define NCCL_MAJOR 2
17
+ #define NCCL_MINOR 20
18
+ #define NCCL_PATCH 5
19
+ #define NCCL_SUFFIX ""
20
+
21
+ #define NCCL_VERSION_CODE 22005
22
+ #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
23
+
24
+ #ifdef __cplusplus
25
+ extern "C" {
26
+ #endif
27
+
28
+ #include <limits.h>
29
+ /* Opaque handle to communicator */
30
+ typedef struct ncclComm* ncclComm_t;
31
+ #define NCCL_COMM_NULL NULL
32
+
33
+ #define NCCL_UNIQUE_ID_BYTES 128
34
+ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
35
+
36
+ /* Error type */
37
+ typedef enum { ncclSuccess = 0,
38
+ ncclUnhandledCudaError = 1,
39
+ ncclSystemError = 2,
40
+ ncclInternalError = 3,
41
+ ncclInvalidArgument = 4,
42
+ ncclInvalidUsage = 5,
43
+ ncclRemoteError = 6,
44
+ ncclInProgress = 7,
45
+ ncclNumResults = 8 } ncclResult_t;
46
+
47
+ #define NCCL_CONFIG_UNDEF_INT INT_MIN
48
+ #define NCCL_CONFIG_UNDEF_PTR NULL
49
+ #define NCCL_SPLIT_NOCOLOR -1
50
+
51
+ /* Communicator configuration. Users can assign value to attributes to specify the
52
+ * behavior of a communicator. */
53
+ typedef struct ncclConfig_v21700 {
54
+ /* attributes that users should never touch. */
55
+ size_t size;
56
+ unsigned int magic;
57
+ unsigned int version;
58
+ /* attributes that users are able to customize. */
59
+ int blocking;
60
+ int cgaClusterSize;
61
+ int minCTAs;
62
+ int maxCTAs;
63
+ const char *netName;
64
+ int splitShare;
65
+ } ncclConfig_t;
66
+
67
+ /* Config initializer must be assigned to initialize config structure when it is created.
68
+ * Not initialized config will result in NCCL error. */
69
+ #define NCCL_CONFIG_INITIALIZER { \
70
+ sizeof(ncclConfig_t), /* size */ \
71
+ 0xcafebeef, /* magic */ \
72
+ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
73
+ NCCL_CONFIG_UNDEF_INT, /* blocking */ \
74
+ NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
75
+ NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
76
+ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
77
+ NCCL_CONFIG_UNDEF_PTR, /* netName */ \
78
+ NCCL_CONFIG_UNDEF_INT /* splitShare */ \
79
+ }
80
+
81
+ /* NCCL malloc and free function for all types of NCCL optimizations
82
+ * (e.g. user buffer registration). The actual allocated size might
83
+ * be larger than requested due to granularity requirement. */
84
+ ncclResult_t ncclMemAlloc(void** ptr, size_t size);
85
+ ncclResult_t pncclMemAlloc(void** ptr, size_t size);
86
+
87
+ ncclResult_t ncclMemFree(void *ptr);
88
+ ncclResult_t pncclMemFree(void *ptr);
89
+
90
+ /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
91
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
92
+ * NCCL library
93
+ */
94
+ ncclResult_t ncclGetVersion(int *version);
95
+ ncclResult_t pncclGetVersion(int *version);
96
+
97
+ /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
98
+ * called once and the Id should be distributed to all ranks in the
99
+ * communicator before calling ncclCommInitRank. */
100
+ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
101
+ ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
102
+
103
+ /* Create a new communicator (multi thread/process version) with a configuration
104
+ * set by users. */
105
+ ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
106
+ ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
107
+
108
+ /* Creates a new communicator (multi thread/process version).
109
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
110
+ * Each rank is associated to a CUDA device, which has to be set before calling
111
+ * ncclCommInitRank.
112
+ * ncclCommInitRank implicitly syncronizes with other ranks, so it must be
113
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
114
+ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
115
+ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
116
+
117
+ /* Creates a clique of communicators (single process version).
118
+ * This is a convenience function to create a single-process communicator clique.
119
+ * Returns an array of ndev newly initialized communicators in comm.
120
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
121
+ * If devlist is NULL, the first ndev CUDA devices are used.
122
+ * Order of devlist defines user-order of processors within the communicator. */
123
+ ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
124
+ ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
125
+
126
+ /* Finalize a communicator. ncclCommFinalize flushes all issued communications,
127
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
128
+ * when the communicator is globally quiescent and related resources are freed; then,
129
+ * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
130
+ * itself) without blocking. */
131
+ ncclResult_t ncclCommFinalize(ncclComm_t comm);
132
+ ncclResult_t pncclCommFinalize(ncclComm_t comm);
133
+
134
+ /* Frees local resources associated with communicator object. */
135
+ ncclResult_t ncclCommDestroy(ncclComm_t comm);
136
+ ncclResult_t pncclCommDestroy(ncclComm_t comm);
137
+
138
+ /* Frees resources associated with communicator object and aborts any operations
139
+ * that might still be running on the device. */
140
+ ncclResult_t ncclCommAbort(ncclComm_t comm);
141
+ ncclResult_t pncclCommAbort(ncclComm_t comm);
142
+
143
+ /* Creates one or more communicators from an existing one.
144
+ * Ranks with the same color will end up in the same communicator.
145
+ * Within the new communicator, key will be used to order ranks.
146
+ * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
147
+ * and will therefore return a NULL communicator.
148
+ * If config is NULL, the new communicator will inherit the original communicator's
149
+ * configuration*/
150
+ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
151
+ ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
152
+
153
+ /* Returns a string for each error code. */
154
+ const char* ncclGetErrorString(ncclResult_t result);
155
+ const char* pncclGetErrorString(ncclResult_t result);
156
+
157
+ /* Returns a human-readable message of the last error that occurred. */
158
+ const char* ncclGetLastError(ncclComm_t comm);
159
+ const char* pncclGetLastError(ncclComm_t comm);
160
+
161
+ /* Checks whether the comm has encountered any asynchronous errors */
162
+ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
163
+ ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
164
+
165
+ /* Gets the number of ranks in the communicator clique. */
166
+ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
167
+ ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
168
+
169
+ /* Returns the cuda device number associated with the communicator. */
170
+ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
171
+ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
172
+
173
+ /* Returns the user-ordered "rank" associated with the communicator. */
174
+ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
175
+ ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
176
+
177
+
178
+ /* Register CUDA buffer for zero-copy operation */
179
+ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
180
+ ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
181
+
182
+ /* Deregister CUDA buffer */
183
+ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
184
+ ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
185
+
186
+ /* Reduction operation selector */
187
+ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
188
+ typedef enum { ncclSum = 0,
189
+ ncclProd = 1,
190
+ ncclMax = 2,
191
+ ncclMin = 3,
192
+ ncclAvg = 4,
193
+ /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
194
+ * serves as the least possible value for dynamic ncclRedOp_t's
195
+ * as constructed by ncclRedOpCreate*** functions. */
196
+ ncclNumOps = 5,
197
+ /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
198
+ * It is defined to be the largest signed value (since compilers
199
+ * are permitted to use signed enums) that won't grow
200
+ * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
201
+ * maintain ABI compatibility. */
202
+ ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
203
+ } ncclRedOp_t;
204
+
205
+ /* Data types */
206
+ typedef enum { ncclInt8 = 0, ncclChar = 0,
207
+ ncclUint8 = 1,
208
+ ncclInt32 = 2, ncclInt = 2,
209
+ ncclUint32 = 3,
210
+ ncclInt64 = 4,
211
+ ncclUint64 = 5,
212
+ ncclFloat16 = 6, ncclHalf = 6,
213
+ ncclFloat32 = 7, ncclFloat = 7,
214
+ ncclFloat64 = 8, ncclDouble = 8,
215
+ #if defined(__CUDA_BF16_TYPES_EXIST__)
216
+ ncclBfloat16 = 9,
217
+ ncclNumTypes = 10
218
+ #else
219
+ ncclNumTypes = 9
220
+ #endif
221
+ } ncclDataType_t;
222
+
223
+ /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
224
+ typedef enum {
225
+ /* ncclScalarDevice: The scalar is in device-visible memory and will be
226
+ * dereferenced while the collective is running. */
227
+ ncclScalarDevice = 0,
228
+
229
+ /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
230
+ * dereferenced before the ncclRedOpCreate***() function returns. */
231
+ ncclScalarHostImmediate = 1
232
+ } ncclScalarResidence_t;
233
+
234
+ /*
235
+ * ncclRedOpCreatePreMulSum
236
+ *
237
+ * Creates a new reduction operator which pre-multiplies input values by a given
238
+ * scalar locally before reducing them with peer values via summation. For use
239
+ * only with collectives launched against *comm* and *datatype*. The
240
+ * *residence* argument indicates how/when the memory pointed to by *scalar*
241
+ * will be dereferenced. Upon return, the newly created operator's handle
242
+ * is stored in *op*.
243
+ */
244
+ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
245
+ ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
246
+
247
+ /*
248
+ * ncclRedOpDestroy
249
+ *
250
+ * Destroys the reduction operator *op*. The operator must have been created by
251
+ * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
252
+ * destroyed as soon as the last NCCL function which is given that operator returns.
253
+ */
254
+ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
255
+ ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
256
+
257
+ /*
258
+ * Collective communication operations
259
+ *
260
+ * Collective communication operations must be called separately for each
261
+ * communicator in a communicator clique.
262
+ *
263
+ * They return when operations have been enqueued on the CUDA stream.
264
+ *
265
+ * Since they may perform inter-CPU synchronization, each call has to be done
266
+ * from a different thread or process, or need to use Group Semantics (see
267
+ * below).
268
+ */
269
+
270
+ /*
271
+ * Reduce
272
+ *
273
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
274
+ * operation.
275
+ * recvbuff may be NULL on all calls except for root device.
276
+ * root is the rank (not the CUDA device) where data will reside after the
277
+ * operation is complete.
278
+ *
279
+ * In-place operation will happen if sendbuff == recvbuff.
280
+ */
281
+ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
282
+ ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
283
+ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
284
+ ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
285
+
286
+ /*
287
+ * (deprecated) Broadcast (in-place)
288
+ *
289
+ * Copies count values from root to all other devices.
290
+ * root is the rank (not the CUDA device) where data resides before the
291
+ * operation is started.
292
+ *
293
+ * This operation is implicitely in place.
294
+ */
295
+ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
296
+ ncclComm_t comm, cudaStream_t stream);
297
+ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
298
+ ncclComm_t comm, cudaStream_t stream);
299
+
300
+ /*
301
+ * Broadcast
302
+ *
303
+ * Copies count values from root to all other devices.
304
+ * root is the rank (not the CUDA device) where data resides before the
305
+ * operation is started.
306
+ *
307
+ * In-place operation will happen if sendbuff == recvbuff.
308
+ */
309
+ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
310
+ ncclComm_t comm, cudaStream_t stream);
311
+ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
312
+ ncclComm_t comm, cudaStream_t stream);
313
+
314
+ /*
315
+ * All-Reduce
316
+ *
317
+ * Reduces data arrays of length count in sendbuff using op operation, and
318
+ * leaves identical copies of result on each recvbuff.
319
+ *
320
+ * In-place operation will happen if sendbuff == recvbuff.
321
+ */
322
+ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
323
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
324
+ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
325
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
326
+
327
+ /*
328
+ * Reduce-Scatter
329
+ *
330
+ * Reduces data in sendbuff using op operation and leaves reduced result
331
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
332
+ * block of the result.
333
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
334
+ * should have a size of at least nranks*recvcount elements.
335
+ *
336
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
337
+ */
338
+ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
339
+ size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
340
+ cudaStream_t stream);
341
+ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
342
+ size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
343
+ cudaStream_t stream);
344
+
345
+ /*
346
+ * All-Gather
347
+ *
348
+ * Each device gathers sendcount values from other GPUs into recvbuff,
349
+ * receiving data from rank i at offset i*sendcount.
350
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
351
+ * should have a size of at least nranks*sendcount elements.
352
+ *
353
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
354
+ */
355
+ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
356
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
357
+ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
358
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
359
+
360
+ /*
361
+ * Send
362
+ *
363
+ * Send data from sendbuff to rank peer.
364
+ *
365
+ * Rank peer needs to call ncclRecv with the same datatype and the same count from this
366
+ * rank.
367
+ *
368
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
369
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
370
+ * ncclGroupEnd section.
371
+ */
372
+ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
373
+ ncclComm_t comm, cudaStream_t stream);
374
+ ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
375
+ ncclComm_t comm, cudaStream_t stream);
376
+
377
+ /*
378
+ * Receive
379
+ *
380
+ * Receive data from rank peer into recvbuff.
381
+ *
382
+ * Rank peer needs to call ncclSend with the same datatype and the same count to this
383
+ * rank.
384
+ *
385
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
386
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
387
+ * ncclGroupEnd section.
388
+ */
389
+ ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
390
+ ncclComm_t comm, cudaStream_t stream);
391
+ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
392
+ ncclComm_t comm, cudaStream_t stream);
393
+
394
+ /*
395
+ * Group semantics
396
+ *
397
+ * When managing multiple GPUs from a single thread, and since NCCL collective
398
+ * calls may perform inter-CPU synchronization, we need to "group" calls for
399
+ * different ranks/devices into a single call.
400
+ *
401
+ * Grouping NCCL calls as being part of the same collective operation is done
402
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
403
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
404
+ * to be complete. Note that for collective communication, ncclGroupEnd only
405
+ * guarantees that the operations are enqueued on the streams, not that
406
+ * the operation is effectively done.
407
+ *
408
+ * Both collective communication and ncclCommInitRank can be used in conjunction
409
+ * of ncclGroupStart/ncclGroupEnd, but not together.
410
+ *
411
+ * Group semantics also allow to fuse multiple operations on the same device
412
+ * to improve performance (for aggregated collective calls), or to permit
413
+ * concurrent progress of multiple send/receive operations.
414
+ */
415
+
416
+ /*
417
+ * Group Start
418
+ *
419
+ * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
420
+ * a single NCCL operation. Nothing will be started on the CUDA stream until
421
+ * ncclGroupEnd.
422
+ */
423
+ ncclResult_t ncclGroupStart();
424
+ ncclResult_t pncclGroupStart();
425
+
426
+ /*
427
+ * Group End
428
+ *
429
+ * End a group call. Start a fused NCCL operation consisting of all calls since
430
+ * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
431
+ * need to be called after ncclGroupEnd.
432
+ */
433
+ ncclResult_t ncclGroupEnd();
434
+ ncclResult_t pncclGroupEnd();
435
+
436
+ /* Register CUDA buffer for zero-copy operation */
437
+ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
438
+ ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
439
+
440
+ /* Deregister CUDA buffer */
441
+ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
442
+ ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
443
+
444
+ #ifdef __cplusplus
445
+ } // end extern "C"
446
+ #endif
447
+
448
+ #endif // end include guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2015 Eric Larson
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+ from argparse import ArgumentParser
8
+ from typing import TYPE_CHECKING
9
+
10
+ from pip._vendor import requests
11
+
12
+ from pip._vendor.cachecontrol.adapter import CacheControlAdapter
13
+ from pip._vendor.cachecontrol.cache import DictCache
14
+ from pip._vendor.cachecontrol.controller import logger
15
+
16
+ if TYPE_CHECKING:
17
+ from argparse import Namespace
18
+
19
+ from pip._vendor.cachecontrol.controller import CacheController
20
+
21
+
22
def setup_logging() -> None:
    """Turn on DEBUG output for the cachecontrol logger, writing to stderr."""
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)
    logger.setLevel(logging.DEBUG)
26
+
27
+
28
def get_session() -> requests.Session:
    """Return a requests Session whose HTTP(S) traffic goes through a caching adapter."""
    caching_adapter = CacheControlAdapter(
        DictCache(), cache_etags=True, serializer=None, heuristic=None
    )
    session = requests.Session()
    for prefix in ("http://", "https://"):
        session.mount(prefix, caching_adapter)

    # Expose the controller so the caller can drive the cache directly.
    session.cache_controller = caching_adapter.controller  # type: ignore[attr-defined]
    return session
38
+
39
+
40
def get_args() -> Namespace:
    """Parse the command line: one positional ``url`` argument."""
    argument_parser = ArgumentParser()
    argument_parser.add_argument("url", help="The URL to try and cache")
    return argument_parser.parse_args()
44
+
45
+
46
def main() -> None:
    """Fetch a URL, store the response in the cache, then verify it round-trips."""
    args = get_args()
    sess = get_session()

    # Hit the network first so the fetch itself stays quiet in the logs.
    resp = sess.get(args.url)

    # Show cachecontrol's internal decisions from here on.
    setup_logging()

    # Manually store the response we just received.
    controller: CacheController = sess.cache_controller  # type: ignore[attr-defined]
    controller.cache_response(resp.request, resp.raw)

    # Round-trip check: can the entry be served back from the cache?
    print("Cached!" if controller.cached_request(resp.request) else "Not cached :(")
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2015 Eric Larson
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ from __future__ import annotations
5
+
6
+ import functools
7
+ import types
8
+ import zlib
9
+ from typing import TYPE_CHECKING, Any, Collection, Mapping
10
+
11
+ from pip._vendor.requests.adapters import HTTPAdapter
12
+
13
+ from pip._vendor.cachecontrol.cache import DictCache
14
+ from pip._vendor.cachecontrol.controller import PERMANENT_REDIRECT_STATUSES, CacheController
15
+ from pip._vendor.cachecontrol.filewrapper import CallbackFileWrapper
16
+
17
+ if TYPE_CHECKING:
18
+ from pip._vendor.requests import PreparedRequest, Response
19
+ from pip._vendor.urllib3 import HTTPResponse
20
+
21
+ from pip._vendor.cachecontrol.cache import BaseCache
22
+ from pip._vendor.cachecontrol.heuristics import BaseHeuristic
23
+ from pip._vendor.cachecontrol.serialize import Serializer
24
+
25
+
26
class CacheControlAdapter(HTTPAdapter):
    """A requests transport adapter that layers HTTP caching onto ``send``.

    ``send`` consults the cache before going to the network;
    ``build_response`` stores/refreshes entries and invalidates them after
    successful mutating requests.
    """

    # A successful request with one of these methods deletes the cached
    # entry for its URL (see the end of build_response).
    invalidating_methods = {"PUT", "PATCH", "DELETE"}

    def __init__(
        self,
        cache: BaseCache | None = None,
        cache_etags: bool = True,
        controller_class: type[CacheController] | None = None,
        serializer: Serializer | None = None,
        heuristic: BaseHeuristic | None = None,
        cacheable_methods: Collection[str] | None = None,
        *args: Any,
        **kw: Any,
    ) -> None:
        # Remaining positional/keyword arguments go straight to HTTPAdapter.
        super().__init__(*args, **kw)
        self.cache = DictCache() if cache is None else cache
        self.heuristic = heuristic
        # Only these HTTP methods are ever served from / stored into the cache.
        self.cacheable_methods = cacheable_methods or ("GET",)

        controller_factory = controller_class or CacheController
        self.controller = controller_factory(
            self.cache, cache_etags=cache_etags, serializer=serializer
        )

    def send(
        self,
        request: PreparedRequest,
        stream: bool = False,
        timeout: None | float | tuple[float, float] | tuple[float, None] = None,
        verify: bool | str = True,
        cert: (None | bytes | str | tuple[bytes | str, bytes | str]) = None,
        proxies: Mapping[str, str] | None = None,
        cacheable_methods: Collection[str] | None = None,
    ) -> Response:
        """
        Send a request. Use the request information to see if it
        exists in the cache and cache the response if we need to and can.
        """
        cacheable = cacheable_methods or self.cacheable_methods
        if request.method in cacheable:
            try:
                cached_response = self.controller.cached_request(request)
            except zlib.error:
                # A corrupt (undecompressable) cache entry is treated as a miss.
                cached_response = None
            if cached_response:
                return self.build_response(request, cached_response, from_cache=True)

            # check for etags and add headers if appropriate
            request.headers.update(self.controller.conditional_headers(request))

        resp = super().send(request, stream, timeout, verify, cert, proxies)

        return resp

    def build_response(
        self,
        request: PreparedRequest,
        response: HTTPResponse,
        from_cache: bool = False,
        cacheable_methods: Collection[str] | None = None,
    ) -> Response:
        """
        Build a response by making a request or using the cache.

        This will end up calling send and returning a potentially
        cached response
        """
        cacheable = cacheable_methods or self.cacheable_methods
        if not from_cache and request.method in cacheable:
            # Check for any heuristics that might update headers
            # before trying to cache.
            if self.heuristic:
                response = self.heuristic.apply(response)

            # apply any expiration heuristics
            if response.status == 304:
                # We must have sent an ETag request. This could mean
                # that we've been expired already or that we simply
                # have an etag. In either case, we want to try and
                # update the cache if that is the case.
                cached_response = self.controller.update_cached_response(
                    request, response
                )

                if cached_response is not response:
                    from_cache = True

                # We are done with the server response, read a
                # possible response body (compliant servers will
                # not return one, but we cannot be 100% sure) and
                # release the connection back to the pool.
                response.read(decode_content=False)
                response.release_conn()

                response = cached_response

            # We always cache the 301 responses
            elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
                self.controller.cache_response(request, response)
            else:
                # Wrap the response file with a wrapper that will cache the
                # response when the stream has been consumed.
                response._fp = CallbackFileWrapper(  # type: ignore[assignment]
                    response._fp,  # type: ignore[arg-type]
                    functools.partial(
                        self.controller.cache_response, request, response
                    ),
                )
                if response.chunked:
                    super_update_chunk_length = response._update_chunk_length

                    def _update_chunk_length(self: HTTPResponse) -> None:
                        # Chunked bodies have no EOF read to trigger the
                        # wrapper; fire the cache callback when the
                        # terminating zero-length chunk is seen.
                        super_update_chunk_length()
                        if self.chunk_left == 0:
                            self._fp._close()  # type: ignore[union-attr]

                    response._update_chunk_length = types.MethodType(  # type: ignore[method-assign]
                        _update_chunk_length, response
                    )

        resp: Response = super().build_response(request, response)  # type: ignore[no-untyped-call]

        # See if we should invalidate the cache.
        if request.method in self.invalidating_methods and resp.ok:
            assert request.url is not None
            cache_url = self.controller.cache_url(request.url)
            self.cache.delete(cache_url)

        # Give the request a from_cache attr to let people use it
        resp.from_cache = from_cache  # type: ignore[attr-defined]

        return resp

    def close(self) -> None:
        # Close the cache backend before the underlying pool manager.
        self.cache.close()
        super().close()  # type: ignore[no-untyped-call]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2015 Eric Larson
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ The cache object API for implementing caches. The default is a thread
7
+ safe in-memory dictionary.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from threading import Lock
12
+ from typing import IO, TYPE_CHECKING, MutableMapping
13
+
14
+ if TYPE_CHECKING:
15
+ from datetime import datetime
16
+
17
+
18
+ class BaseCache:
19
+ def get(self, key: str) -> bytes | None:
20
+ raise NotImplementedError()
21
+
22
+ def set(
23
+ self, key: str, value: bytes, expires: int | datetime | None = None
24
+ ) -> None:
25
+ raise NotImplementedError()
26
+
27
+ def delete(self, key: str) -> None:
28
+ raise NotImplementedError()
29
+
30
+ def close(self) -> None:
31
+ pass
32
+
33
+
34
class DictCache(BaseCache):
    """In-memory cache backed by a plain dict, with a lock guarding writes."""

    def __init__(self, init_dict: MutableMapping[str, bytes] | None = None) -> None:
        self.lock = Lock()
        self.data = init_dict if init_dict else {}

    def get(self, key: str) -> bytes | None:
        # Lookups are done without taking the lock.
        return self.data.get(key, None)

    def set(
        self, key: str, value: bytes, expires: int | datetime | None = None
    ) -> None:
        # ``expires`` is accepted for interface compatibility but ignored
        # by this in-memory backend.
        with self.lock:
            self.data[key] = value

    def delete(self, key: str) -> None:
        with self.lock:
            self.data.pop(key, None)
52
+
53
+
54
class SeparateBodyBaseCache(BaseCache):
    """
    In this variant, the body is not stored mixed in with the metadata, but is
    passed in (as a bytes-like object) in a separate call to ``set_body()``.

    That is, the expected interaction pattern is::

        cache.set(key, serialized_metadata)
        cache.set_body(key, body)

    Similarly, the body should be loaded separately via ``get_body()``.
    """

    def set_body(self, key: str, body: bytes) -> None:
        """Store the raw response *body* under *key*."""
        raise NotImplementedError()

    def get_body(self, key: str) -> IO[bytes] | None:
        """
        Return the body as file-like object.
        """
        raise NotImplementedError()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2015 Eric Larson
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ The httplib2 algorithms ported for use with requests.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import calendar
11
+ import logging
12
+ import re
13
+ import time
14
+ from email.utils import parsedate_tz
15
+ from typing import TYPE_CHECKING, Collection, Mapping
16
+
17
+ from pip._vendor.requests.structures import CaseInsensitiveDict
18
+
19
+ from pip._vendor.cachecontrol.cache import DictCache, SeparateBodyBaseCache
20
+ from pip._vendor.cachecontrol.serialize import Serializer
21
+
22
+ if TYPE_CHECKING:
23
+ from typing import Literal
24
+
25
+ from pip._vendor.requests import PreparedRequest
26
+ from pip._vendor.urllib3 import HTTPResponse
27
+
28
+ from pip._vendor.cachecontrol.cache import BaseCache
29
+
30
logger = logging.getLogger(__name__)

# URI splitter regex from Appendix B of RFC 3986; consumed by parse_uri below.
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

# Permanent redirects: cached regardless of freshness headers
# (see CacheController.cached_request / cache_response).
PERMANENT_REDIRECT_STATUSES = (301, 308)
35
+
36
+
37
def parse_uri(uri: str) -> tuple[str, str, str, str, str]:
    """Split *uri* with the RFC 3986 Appendix B regex.

    Returns ``(scheme, authority, path, query, fragment)``.
    """
    parts = URI.match(uri)
    assert parts is not None
    # Odd-numbered groups capture the delimiters; the even ones hold the values.
    scheme = parts.group(2)
    authority = parts.group(4)
    path = parts.group(5)
    query = parts.group(7)
    fragment = parts.group(9)
    return (scheme, authority, path, query, fragment)
46
+
47
+
48
class CacheController:
    """Decides whether requests can be served from, and responses stored
    into, an HTTP cache (httplib2's algorithms ported for requests)."""

    def __init__(
        self,
        cache: BaseCache | None = None,
        cache_etags: bool = True,
        serializer: Serializer | None = None,
        status_codes: Collection[int] | None = None,
    ):
        # cache: backend storage; defaults to an in-memory DictCache.
        # cache_etags: when True, responses carrying an ETag are cached.
        # status_codes: status codes eligible for caching.
        self.cache = DictCache() if cache is None else cache
        self.cache_etags = cache_etags
        self.serializer = serializer or Serializer()
        self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308)

    @classmethod
    def _urlnorm(cls, uri: str) -> str:
        """Normalize the URL to create a safe key for the cache"""
        (scheme, authority, path, query, fragment) = parse_uri(uri)
        if not scheme or not authority:
            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)

        scheme = scheme.lower()
        authority = authority.lower()

        if not path:
            path = "/"

        # Could do syntax based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = query and "?".join([path, query]) or path
        defrag_uri = scheme + "://" + authority + request_uri

        return defrag_uri

    @classmethod
    def cache_url(cls, uri: str) -> str:
        """Return the normalized cache key for *uri* (fragment dropped)."""
        return cls._urlnorm(uri)

    def parse_cache_control(self, headers: Mapping[str, str]) -> dict[str, int | None]:
        """Parse a Cache-Control header into a ``{directive: int | None}`` dict.

        Unknown directives are ignored; directives whose required integer
        value is missing or invalid are logged and skipped.
        """
        known_directives = {
            # https://tools.ietf.org/html/rfc7234#section-5.2
            "max-age": (int, True),
            "max-stale": (int, False),
            "min-fresh": (int, True),
            "no-cache": (None, False),
            "no-store": (None, False),
            "no-transform": (None, False),
            "only-if-cached": (None, False),
            "must-revalidate": (None, False),
            "public": (None, False),
            "private": (None, False),
            "proxy-revalidate": (None, False),
            "s-maxage": (int, True),
        }

        cc_headers = headers.get("cache-control", headers.get("Cache-Control", ""))

        retval: dict[str, int | None] = {}

        for cc_directive in cc_headers.split(","):
            if not cc_directive.strip():
                continue

            parts = cc_directive.split("=", 1)
            directive = parts[0].strip()

            try:
                typ, required = known_directives[directive]
            except KeyError:
                logger.debug("Ignoring unknown cache-control directive: %s", directive)
                continue

            if not typ or not required:
                retval[directive] = None
            if typ:
                try:
                    retval[directive] = typ(parts[1].strip())
                except IndexError:
                    if required:
                        logger.debug(
                            "Missing value for cache-control " "directive: %s",
                            directive,
                        )
                except ValueError:
                    logger.debug(
                        "Invalid value for cache-control directive " "%s, must be %s",
                        directive,
                        typ.__name__,
                    )

        return retval

    def _load_from_cache(self, request: PreparedRequest) -> HTTPResponse | None:
        """
        Load a cached response, or return None if it's not available.
        """
        # We do not support caching of partial content: so if the request contains a
        # Range header then we don't want to load anything from the cache.
        if "Range" in request.headers:
            return None

        # NOTE(review): the raw request.url is used as the key here, while
        # cached_request/cache_response use the normalized cache_url() —
        # presumably the two coincide for prepared requests; verify.
        cache_url = request.url
        assert cache_url is not None
        cache_data = self.cache.get(cache_url)
        if cache_data is None:
            logger.debug("No cache entry available")
            return None

        if isinstance(self.cache, SeparateBodyBaseCache):
            body_file = self.cache.get_body(cache_url)
        else:
            body_file = None

        result = self.serializer.loads(request, cache_data, body_file)
        if result is None:
            logger.warning("Cache entry deserialization failed, entry ignored")
        return result

    def cached_request(self, request: PreparedRequest) -> HTTPResponse | Literal[False]:
        """
        Return a cached response if it exists in the cache, otherwise
        return False.
        """
        assert request.url is not None
        cache_url = self.cache_url(request.url)
        logger.debug('Looking up "%s" in the cache', cache_url)
        cc = self.parse_cache_control(request.headers)

        # Bail out if the request insists on fresh data
        if "no-cache" in cc:
            logger.debug('Request header has "no-cache", cache bypassed')
            return False

        if "max-age" in cc and cc["max-age"] == 0:
            logger.debug('Request header has "max_age" as 0, cache bypassed')
            return False

        # Check whether we can load the response from the cache:
        resp = self._load_from_cache(request)
        if not resp:
            return False

        # If we have a cached permanent redirect, return it immediately. We
        # don't need to test our response for other headers b/c it is
        # intrinsically "cacheable" as it is Permanent.
        #
        # See:
        #   https://tools.ietf.org/html/rfc7231#section-6.4.2
        #
        # Client can try to refresh the value by repeating the request
        # with cache busting headers as usual (ie no-cache).
        if int(resp.status) in PERMANENT_REDIRECT_STATUSES:
            msg = (
                "Returning cached permanent redirect response "
                "(ignoring date and etag information)"
            )
            logger.debug(msg)
            return resp

        headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers)
        if not headers or "date" not in headers:
            if "etag" not in headers:
                # Without date or etag, the cached response can never be used
                # and should be deleted.
                logger.debug("Purging cached response: no date or etag")
                self.cache.delete(cache_url)
            logger.debug("Ignoring cached response: no date")
            return False

        now = time.time()
        time_tuple = parsedate_tz(headers["date"])
        assert time_tuple is not None
        date = calendar.timegm(time_tuple[:6])
        current_age = max(0, now - date)
        logger.debug("Current age based on date: %i", current_age)

        # TODO: There is an assumption that the result will be a
        #       urllib3 response object. This may not be best since we
        #       could probably avoid instantiating or constructing the
        #       response until we know we need it.
        resp_cc = self.parse_cache_control(headers)

        # determine freshness
        freshness_lifetime = 0

        # Check the max-age pragma in the cache control header
        max_age = resp_cc.get("max-age")
        if max_age is not None:
            freshness_lifetime = max_age
            logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime)

        # If there isn't a max-age, check for an expires header
        elif "expires" in headers:
            expires = parsedate_tz(headers["expires"])
            if expires is not None:
                expire_time = calendar.timegm(expires[:6]) - date
                freshness_lifetime = max(0, expire_time)
                logger.debug("Freshness lifetime from expires: %i", freshness_lifetime)

        # Determine if we are setting freshness limit in the
        # request. Note, this overrides what was in the response.
        max_age = cc.get("max-age")
        if max_age is not None:
            freshness_lifetime = max_age
            logger.debug(
                "Freshness lifetime from request max-age: %i", freshness_lifetime
            )

        min_fresh = cc.get("min-fresh")
        if min_fresh is not None:
            # adjust our current age by our min fresh
            current_age += min_fresh
            logger.debug("Adjusted current age from min-fresh: %i", current_age)

        # Return entry if it is fresh enough
        if freshness_lifetime > current_age:
            logger.debug('The response is "fresh", returning cached response')
            logger.debug("%i > %i", freshness_lifetime, current_age)
            return resp

        # we're not fresh. If we don't have an Etag, clear it out
        if "etag" not in headers:
            logger.debug('The cached response is "stale" with no etag, purging')
            self.cache.delete(cache_url)

        # return the original handler
        return False

    def conditional_headers(self, request: PreparedRequest) -> dict[str, str]:
        """Build If-None-Match / If-Modified-Since headers from a cached entry."""
        resp = self._load_from_cache(request)
        new_headers = {}

        if resp:
            headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers)

            if "etag" in headers:
                new_headers["If-None-Match"] = headers["ETag"]

            if "last-modified" in headers:
                new_headers["If-Modified-Since"] = headers["Last-Modified"]

        return new_headers

    def _cache_set(
        self,
        cache_url: str,
        request: PreparedRequest,
        response: HTTPResponse,
        body: bytes | None = None,
        expires_time: int | None = None,
    ) -> None:
        """
        Store the data in the cache.
        """
        if isinstance(self.cache, SeparateBodyBaseCache):
            # We pass in the body separately; just put a placeholder empty
            # string in the metadata.
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, b""),
                expires=expires_time,
            )
            # body is None can happen when, for example, we're only updating
            # headers, as is the case in update_cached_response().
            if body is not None:
                self.cache.set_body(cache_url, body)
        else:
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, body),
                expires=expires_time,
            )

    def cache_response(
        self,
        request: PreparedRequest,
        response: HTTPResponse,
        body: bytes | None = None,
        status_codes: Collection[int] | None = None,
    ) -> None:
        """
        Algorithm for caching requests.

        This assumes a requests Response object.
        """
        # From httplib2: Don't cache 206's since we aren't going to
        #                handle byte range requests
        cacheable_status_codes = status_codes or self.cacheable_status_codes
        if response.status not in cacheable_status_codes:
            logger.debug(
                "Status code %s not in %s", response.status, cacheable_status_codes
            )
            return

        response_headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(
            response.headers
        )

        if "date" in response_headers:
            time_tuple = parsedate_tz(response_headers["date"])
            assert time_tuple is not None
            date = calendar.timegm(time_tuple[:6])
        else:
            date = 0

        # If we've been given a body, our response has a Content-Length, that
        # Content-Length is valid then we can check to see if the body we've
        # been given matches the expected size, and if it doesn't we'll just
        # skip trying to cache it.
        if (
            body is not None
            and "content-length" in response_headers
            and response_headers["content-length"].isdigit()
            and int(response_headers["content-length"]) != len(body)
        ):
            return

        cc_req = self.parse_cache_control(request.headers)
        cc = self.parse_cache_control(response_headers)

        assert request.url is not None
        cache_url = self.cache_url(request.url)
        logger.debug('Updating cache with response from "%s"', cache_url)

        # Delete it from the cache if we happen to have it stored there
        no_store = False
        if "no-store" in cc:
            no_store = True
            logger.debug('Response header has "no-store"')
        if "no-store" in cc_req:
            no_store = True
            logger.debug('Request header has "no-store"')
        if no_store and self.cache.get(cache_url):
            logger.debug('Purging existing cache entry to honor "no-store"')
            self.cache.delete(cache_url)
        if no_store:
            return

        # https://tools.ietf.org/html/rfc7234#section-4.1:
        # A Vary header field-value of "*" always fails to match.
        # Storing such a response leads to a deserialization warning
        # during cache lookup and is not allowed to ever be served,
        # so storing it can be avoided.
        if "*" in response_headers.get("vary", ""):
            logger.debug('Response header has "Vary: *"')
            return

        # If we've been given an etag, then keep the response
        if self.cache_etags and "etag" in response_headers:
            expires_time = 0
            if response_headers.get("expires"):
                expires = parsedate_tz(response_headers["expires"])
                if expires is not None:
                    expires_time = calendar.timegm(expires[:6]) - date

            # Keep etag entries around for at least 14 days so they can be
            # revalidated later.
            expires_time = max(expires_time, 14 * 86400)

            logger.debug(f"etag object cached for {expires_time} seconds")
            logger.debug("Caching due to etag")
            self._cache_set(cache_url, request, response, body, expires_time)

        # Add to the cache any permanent redirects. We do this before looking
        # that the Date headers.
        elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
            logger.debug("Caching permanent redirect")
            self._cache_set(cache_url, request, response, b"")

        # Add to the cache if the response headers demand it. If there
        # is no date header then we can't do anything about expiring
        # the cache.
        elif "date" in response_headers:
            time_tuple = parsedate_tz(response_headers["date"])
            assert time_tuple is not None
            date = calendar.timegm(time_tuple[:6])
            # cache when there is a max-age > 0
            max_age = cc.get("max-age")
            if max_age is not None and max_age > 0:
                logger.debug("Caching b/c date exists and max-age > 0")
                expires_time = max_age
                self._cache_set(
                    cache_url,
                    request,
                    response,
                    body,
                    expires_time,
                )

            # If the request can expire, it means we should cache it
            # in the meantime.
            elif "expires" in response_headers:
                if response_headers["expires"]:
                    expires = parsedate_tz(response_headers["expires"])
                    if expires is not None:
                        expires_time = calendar.timegm(expires[:6]) - date
                    else:
                        expires_time = None

                    logger.debug(
                        "Caching b/c of expires header. expires in {} seconds".format(
                            expires_time
                        )
                    )
                    self._cache_set(
                        cache_url,
                        request,
                        response,
                        body,
                        expires_time,
                    )

    def update_cached_response(
        self, request: PreparedRequest, response: HTTPResponse
    ) -> HTTPResponse:
        """On a 304 we will get a new set of headers that we want to
        update our cached value with, assuming we have one.

        This should only ever be called when we've sent an ETag and
        gotten a 304 as the response.
        """
        assert request.url is not None
        cache_url = self.cache_url(request.url)
        cached_response = self._load_from_cache(request)

        if not cached_response:
            # we didn't have a cached response
            return response

        # Lets update our headers with the headers from the new request:
        # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
        #
        # The server isn't supposed to send headers that would make
        # the cached body invalid. But... just in case, we'll be sure
        # to strip out ones we know that might be problematic due to
        # typical assumptions.
        excluded_headers = ["content-length"]

        cached_response.headers.update(
            {
                k: v
                for k, v in response.headers.items()
                if k.lower() not in excluded_headers
            }
        )

        # we want a 200 b/c we have content via the cache
        cached_response.status = 200

        # update our cache
        self._cache_set(cache_url, request, cached_response)

        return cached_response