koichi12 commited on
Commit
bdf6bb8
·
verified ·
1 Parent(s): a249ee4

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc +0 -0
  2. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc +0 -0
  3. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc +0 -0
  4. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc +0 -0
  5. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc +0 -0
  6. tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc +0 -0
  7. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h +891 -0
  8. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h +693 -0
  9. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h +824 -0
  10. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  11. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py +0 -0
  12. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc +0 -0
  13. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py +0 -0
  14. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc +0 -0
  15. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h +758 -0
  16. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  17. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h +348 -0
  18. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h +227 -0
  19. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h +350 -0
  20. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h +94 -0
  21. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp +0 -0
  22. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h +1958 -0
  23. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h +224 -0
  24. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h +0 -0
  25. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h +109 -0
  26. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp +224 -0
  27. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h +65 -0
  28. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h +118 -0
  29. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h +145 -0
  30. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h +0 -0
  31. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h +65 -0
  32. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h +103 -0
  33. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h +65 -0
  34. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp +134 -0
  35. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp +527 -0
  36. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h +439 -0
  37. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h +739 -0
  38. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  39. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h +78 -0
  40. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h +658 -0
  41. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py +0 -0
  42. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h +0 -0
  43. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  44. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc +0 -0
  45. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h +448 -0
  46. tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py +0 -0
  47. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py +70 -0
  48. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py +161 -0
  49. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py +74 -0
  50. tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py +499 -0
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc ADDED
Binary file (13.6 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc ADDED
Binary file (11.3 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc ADDED
Binary file (46.4 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc ADDED
Binary file (25.4 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc ADDED
Binary file (67.5 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc ADDED
Binary file (6.66 kB). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h ADDED
@@ -0,0 +1,891 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * This is the public header file for the CUBLAS library, defining the API
52
+ *
53
+ * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
54
+ * on top of the CUDA runtime.
55
+ */
56
+
57
+ #if !defined(CUBLAS_H_)
58
+ #define CUBLAS_H_
59
+
60
+ #if defined(CUBLAS_V2_H_)
61
+ #error "It is an error to include both cublas.h and cublas_v2.h"
62
+ #endif
63
+
64
+ #include <cuda_runtime.h>
65
+
66
+ #ifndef CUBLASWINAPI
67
+ #ifdef _WIN32
68
+ #define CUBLASWINAPI __stdcall
69
+ #else
70
+ #define CUBLASWINAPI
71
+ #endif
72
+ #endif
73
+
74
+ #undef CUBLASAPI
75
+ #ifdef __CUDACC__
76
+ #define CUBLASAPI __host__
77
+ #else
78
+ #define CUBLASAPI
79
+ #endif
80
+
81
+ #include "cublas_api.h"
82
+
83
+ #if defined(__cplusplus)
84
+ extern "C" {
85
+ #endif
86
+
87
+ /* CUBLAS data types */
88
+ #define cublasStatus cublasStatus_t
89
+
90
+ cublasStatus CUBLASWINAPI cublasInit(void);
91
+ cublasStatus CUBLASWINAPI cublasShutdown(void);
92
+ cublasStatus CUBLASWINAPI cublasGetError(void);
93
+
94
+ cublasStatus CUBLASWINAPI cublasGetVersion(int* version);
95
+ cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr);
96
+
97
+ cublasStatus CUBLASWINAPI cublasFree(void* devicePtr);
98
+
99
+ cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream);
100
+
101
+ /* ---------------- CUBLAS BLAS1 functions ---------------- */
102
+ /* NRM2 */
103
+ float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx);
104
+ double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx);
105
+ float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx);
106
+ double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx);
107
+ /*------------------------------------------------------------------------*/
108
+ /* DOT */
109
+ float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, int incy);
110
+ double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy);
111
+ cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
112
+ cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
113
+ cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
114
+ cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
115
+ /*------------------------------------------------------------------------*/
116
+ /* SCAL */
117
+ void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx);
118
+ void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx);
119
+ void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx);
120
+ void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx);
121
+
122
+ void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx);
123
+ void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx);
124
+ /*------------------------------------------------------------------------*/
125
+ /* AXPY */
126
+ void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy);
127
+ void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy);
128
+ void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
129
+ void CUBLASWINAPI
130
+ cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
131
+ /*------------------------------------------------------------------------*/
132
+ /* COPY */
133
+ void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy);
134
+ void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy);
135
+ void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, int incy);
136
+ void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
137
+ /*------------------------------------------------------------------------*/
138
+ /* SWAP */
139
+ void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy);
140
+ void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy);
141
+ void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy);
142
+ void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
143
+ /*------------------------------------------------------------------------*/
144
+ /* AMAX */
145
+ int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx);
146
+ int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx);
147
+ int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx);
148
+ int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx);
149
+ /*------------------------------------------------------------------------*/
150
+ /* AMIN */
151
+ int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx);
152
+ int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx);
153
+
154
+ int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx);
155
+ int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx);
156
+ /*------------------------------------------------------------------------*/
157
+ /* ASUM */
158
+ float CUBLASWINAPI cublasSasum(int n, const float* x, int incx);
159
+ double CUBLASWINAPI cublasDasum(int n, const double* x, int incx);
160
+ float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx);
161
+ double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx);
162
+ /*------------------------------------------------------------------------*/
163
+ /* ROT */
164
+ void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss);
165
+ void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss);
166
+ void CUBLASWINAPI cublasCrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, cuComplex s);
167
+ void CUBLASWINAPI
168
+ cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs);
169
+ void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s);
170
+ void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s);
171
+ /*------------------------------------------------------------------------*/
172
+ /* ROTG */
173
+ void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss);
174
+ void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss);
175
+ void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs);
176
+ void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs);
177
+ /*------------------------------------------------------------------------*/
178
+ /* ROTM */
179
+ void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam);
180
+ void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam);
181
+ /*------------------------------------------------------------------------*/
182
+ /* ROTMG */
183
+ void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam);
184
+ void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam);
185
+
186
+ /* --------------- CUBLAS BLAS2 functions ---------------- */
187
+ /* GEMV */
188
+ void CUBLASWINAPI cublasSgemv(char trans,
189
+ int m,
190
+ int n,
191
+ float alpha,
192
+ const float* A,
193
+ int lda,
194
+ const float* x,
195
+ int incx,
196
+ float beta,
197
+ float* y,
198
+ int incy);
199
+ void CUBLASWINAPI cublasDgemv(char trans,
200
+ int m,
201
+ int n,
202
+ double alpha,
203
+ const double* A,
204
+ int lda,
205
+ const double* x,
206
+ int incx,
207
+ double beta,
208
+ double* y,
209
+ int incy);
210
+ void CUBLASWINAPI cublasCgemv(char trans,
211
+ int m,
212
+ int n,
213
+ cuComplex alpha,
214
+ const cuComplex* A,
215
+ int lda,
216
+ const cuComplex* x,
217
+ int incx,
218
+ cuComplex beta,
219
+ cuComplex* y,
220
+ int incy);
221
+ void CUBLASWINAPI cublasZgemv(char trans,
222
+ int m,
223
+ int n,
224
+ cuDoubleComplex alpha,
225
+ const cuDoubleComplex* A,
226
+ int lda,
227
+ const cuDoubleComplex* x,
228
+ int incx,
229
+ cuDoubleComplex beta,
230
+ cuDoubleComplex* y,
231
+ int incy);
232
+ /*------------------------------------------------------------------------*/
233
+ /* GBMV */
234
+ void CUBLASWINAPI cublasSgbmv(char trans,
235
+ int m,
236
+ int n,
237
+ int kl,
238
+ int ku,
239
+ float alpha,
240
+ const float* A,
241
+ int lda,
242
+ const float* x,
243
+ int incx,
244
+ float beta,
245
+ float* y,
246
+ int incy);
247
+ void CUBLASWINAPI cublasDgbmv(char trans,
248
+ int m,
249
+ int n,
250
+ int kl,
251
+ int ku,
252
+ double alpha,
253
+ const double* A,
254
+ int lda,
255
+ const double* x,
256
+ int incx,
257
+ double beta,
258
+ double* y,
259
+ int incy);
260
+ void CUBLASWINAPI cublasCgbmv(char trans,
261
+ int m,
262
+ int n,
263
+ int kl,
264
+ int ku,
265
+ cuComplex alpha,
266
+ const cuComplex* A,
267
+ int lda,
268
+ const cuComplex* x,
269
+ int incx,
270
+ cuComplex beta,
271
+ cuComplex* y,
272
+ int incy);
273
+ void CUBLASWINAPI cublasZgbmv(char trans,
274
+ int m,
275
+ int n,
276
+ int kl,
277
+ int ku,
278
+ cuDoubleComplex alpha,
279
+ const cuDoubleComplex* A,
280
+ int lda,
281
+ const cuDoubleComplex* x,
282
+ int incx,
283
+ cuDoubleComplex beta,
284
+ cuDoubleComplex* y,
285
+ int incy);
286
+ /*------------------------------------------------------------------------*/
287
+ /* TRMV */
288
+ void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
289
+ void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
290
+ void CUBLASWINAPI
291
+ cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
292
+ void CUBLASWINAPI
293
+ cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
294
+ /*------------------------------------------------------------------------*/
295
+ /* TBMV */
296
+ void CUBLASWINAPI
297
+ cublasStbmv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
298
+ void CUBLASWINAPI
299
+ cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
300
+ void CUBLASWINAPI
301
+ cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
302
+ void CUBLASWINAPI cublasZtbmv(
303
+ char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
304
+ /*------------------------------------------------------------------------*/
305
+ /* TPMV */
306
+ void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
307
+
308
+ void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
309
+
310
+ void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
311
+
312
+ void CUBLASWINAPI
313
+ cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
314
+ /*------------------------------------------------------------------------*/
315
+ /* TRSV */
316
+ void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
317
+
318
+ void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
319
+
320
+ void CUBLASWINAPI
321
+ cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
322
+
323
+ void CUBLASWINAPI
324
+ cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
325
+ /*------------------------------------------------------------------------*/
326
+ /* TPSV */
327
+ void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
328
+
329
+ void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
330
+
331
+ void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
332
+
333
+ void CUBLASWINAPI
334
+ cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
335
+ /*------------------------------------------------------------------------*/
336
+ /* TBSV */
337
+ void CUBLASWINAPI
338
+ cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
339
+
340
+ void CUBLASWINAPI
341
+ cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
342
+ void CUBLASWINAPI
343
+ cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
344
+
345
+ void CUBLASWINAPI cublasZtbsv(
346
+ char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
347
+ /*------------------------------------------------------------------------*/
348
+ /* SYMV/HEMV */
349
+ void CUBLASWINAPI cublasSsymv(
350
+ char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy);
351
+ void CUBLASWINAPI cublasDsymv(char uplo,
352
+ int n,
353
+ double alpha,
354
+ const double* A,
355
+ int lda,
356
+ const double* x,
357
+ int incx,
358
+ double beta,
359
+ double* y,
360
+ int incy);
361
+ void CUBLASWINAPI cublasChemv(char uplo,
362
+ int n,
363
+ cuComplex alpha,
364
+ const cuComplex* A,
365
+ int lda,
366
+ const cuComplex* x,
367
+ int incx,
368
+ cuComplex beta,
369
+ cuComplex* y,
370
+ int incy);
371
+ void CUBLASWINAPI cublasZhemv(char uplo,
372
+ int n,
373
+ cuDoubleComplex alpha,
374
+ const cuDoubleComplex* A,
375
+ int lda,
376
+ const cuDoubleComplex* x,
377
+ int incx,
378
+ cuDoubleComplex beta,
379
+ cuDoubleComplex* y,
380
+ int incy);
381
+ /*------------------------------------------------------------------------*/
382
+ /* SBMV/HBMV */
383
+ void CUBLASWINAPI cublasSsbmv(char uplo,
384
+ int n,
385
+ int k,
386
+ float alpha,
387
+ const float* A,
388
+ int lda,
389
+ const float* x,
390
+ int incx,
391
+ float beta,
392
+ float* y,
393
+ int incy);
394
+ void CUBLASWINAPI cublasDsbmv(char uplo,
395
+ int n,
396
+ int k,
397
+ double alpha,
398
+ const double* A,
399
+ int lda,
400
+ const double* x,
401
+ int incx,
402
+ double beta,
403
+ double* y,
404
+ int incy);
405
+ void CUBLASWINAPI cublasChbmv(char uplo,
406
+ int n,
407
+ int k,
408
+ cuComplex alpha,
409
+ const cuComplex* A,
410
+ int lda,
411
+ const cuComplex* x,
412
+ int incx,
413
+ cuComplex beta,
414
+ cuComplex* y,
415
+ int incy);
416
+ void CUBLASWINAPI cublasZhbmv(char uplo,
417
+ int n,
418
+ int k,
419
+ cuDoubleComplex alpha,
420
+ const cuDoubleComplex* A,
421
+ int lda,
422
+ const cuDoubleComplex* x,
423
+ int incx,
424
+ cuDoubleComplex beta,
425
+ cuDoubleComplex* y,
426
+ int incy);
427
+ /*------------------------------------------------------------------------*/
428
+ /* SPMV/HPMV */
429
+ void CUBLASWINAPI
430
+ cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy);
431
+ void CUBLASWINAPI cublasDspmv(
432
+ char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy);
433
+ void CUBLASWINAPI cublasChpmv(char uplo,
434
+ int n,
435
+ cuComplex alpha,
436
+ const cuComplex* AP,
437
+ const cuComplex* x,
438
+ int incx,
439
+ cuComplex beta,
440
+ cuComplex* y,
441
+ int incy);
442
+ void CUBLASWINAPI cublasZhpmv(char uplo,
443
+ int n,
444
+ cuDoubleComplex alpha,
445
+ const cuDoubleComplex* AP,
446
+ const cuDoubleComplex* x,
447
+ int incx,
448
+ cuDoubleComplex beta,
449
+ cuDoubleComplex* y,
450
+ int incy);
451
+
452
+ /*------------------------------------------------------------------------*/
453
+ /* GER */
454
+ void CUBLASWINAPI
455
+ cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
456
+ void CUBLASWINAPI
457
+ cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
458
+
459
+ void CUBLASWINAPI cublasCgeru(
460
+ int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
461
+ void CUBLASWINAPI cublasCgerc(
462
+ int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
463
+ void CUBLASWINAPI cublasZgeru(int m,
464
+ int n,
465
+ cuDoubleComplex alpha,
466
+ const cuDoubleComplex* x,
467
+ int incx,
468
+ const cuDoubleComplex* y,
469
+ int incy,
470
+ cuDoubleComplex* A,
471
+ int lda);
472
+ void CUBLASWINAPI cublasZgerc(int m,
473
+ int n,
474
+ cuDoubleComplex alpha,
475
+ const cuDoubleComplex* x,
476
+ int incx,
477
+ const cuDoubleComplex* y,
478
+ int incy,
479
+ cuDoubleComplex* A,
480
+ int lda);
481
+ /*------------------------------------------------------------------------*/
482
+ /* SYR/HER */
483
+ void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda);
484
+ void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda);
485
+
486
+ void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda);
487
+ void CUBLASWINAPI
488
+ cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda);
489
+
490
+ /*------------------------------------------------------------------------*/
491
+ /* SPR/HPR */
492
+ void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP);
493
+ void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP);
494
+ void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP);
495
+ void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP);
496
+ /*------------------------------------------------------------------------*/
497
+ /* SYR2/HER2 */
498
+ void CUBLASWINAPI
499
+ cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
500
+ void CUBLASWINAPI
501
+ cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
502
+ void CUBLASWINAPI cublasCher2(char uplo,
503
+ int n,
504
+ cuComplex alpha,
505
+ const cuComplex* x,
506
+ int incx,
507
+ const cuComplex* y,
508
+ int incy,
509
+ cuComplex* A,
510
+ int lda);
511
+ void CUBLASWINAPI cublasZher2(char uplo,
512
+ int n,
513
+ cuDoubleComplex alpha,
514
+ const cuDoubleComplex* x,
515
+ int incx,
516
+ const cuDoubleComplex* y,
517
+ int incy,
518
+ cuDoubleComplex* A,
519
+ int lda);
520
+
521
+ /*------------------------------------------------------------------------*/
522
+ /* SPR2/HPR2 */
523
+ void CUBLASWINAPI
524
+ cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP);
525
+ void CUBLASWINAPI
526
+ cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP);
527
+ void CUBLASWINAPI cublasChpr2(
528
+ char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP);
529
+ void CUBLASWINAPI cublasZhpr2(char uplo,
530
+ int n,
531
+ cuDoubleComplex alpha,
532
+ const cuDoubleComplex* x,
533
+ int incx,
534
+ const cuDoubleComplex* y,
535
+ int incy,
536
+ cuDoubleComplex* AP);
537
+ /* ------------------------BLAS3 Functions ------------------------------- */
538
+ /* GEMM */
539
+ void CUBLASWINAPI cublasSgemm(char transa,
540
+ char transb,
541
+ int m,
542
+ int n,
543
+ int k,
544
+ float alpha,
545
+ const float* A,
546
+ int lda,
547
+ const float* B,
548
+ int ldb,
549
+ float beta,
550
+ float* C,
551
+ int ldc);
552
+ void CUBLASWINAPI cublasDgemm(char transa,
553
+ char transb,
554
+ int m,
555
+ int n,
556
+ int k,
557
+ double alpha,
558
+ const double* A,
559
+ int lda,
560
+ const double* B,
561
+ int ldb,
562
+ double beta,
563
+ double* C,
564
+ int ldc);
565
+ void CUBLASWINAPI cublasCgemm(char transa,
566
+ char transb,
567
+ int m,
568
+ int n,
569
+ int k,
570
+ cuComplex alpha,
571
+ const cuComplex* A,
572
+ int lda,
573
+ const cuComplex* B,
574
+ int ldb,
575
+ cuComplex beta,
576
+ cuComplex* C,
577
+ int ldc);
578
+ void CUBLASWINAPI cublasZgemm(char transa,
579
+ char transb,
580
+ int m,
581
+ int n,
582
+ int k,
583
+ cuDoubleComplex alpha,
584
+ const cuDoubleComplex* A,
585
+ int lda,
586
+ const cuDoubleComplex* B,
587
+ int ldb,
588
+ cuDoubleComplex beta,
589
+ cuDoubleComplex* C,
590
+ int ldc);
591
+ /* -------------------------------------------------------*/
592
+ /* SYRK */
593
+ void CUBLASWINAPI
594
+ cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc);
595
+ void CUBLASWINAPI cublasDsyrk(
596
+ char uplo, char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc);
597
+
598
+ void CUBLASWINAPI cublasCsyrk(char uplo,
599
+ char trans,
600
+ int n,
601
+ int k,
602
+ cuComplex alpha,
603
+ const cuComplex* A,
604
+ int lda,
605
+ cuComplex beta,
606
+ cuComplex* C,
607
+ int ldc);
608
+ void CUBLASWINAPI cublasZsyrk(char uplo,
609
+ char trans,
610
+ int n,
611
+ int k,
612
+ cuDoubleComplex alpha,
613
+ const cuDoubleComplex* A,
614
+ int lda,
615
+ cuDoubleComplex beta,
616
+ cuDoubleComplex* C,
617
+ int ldc);
618
+ /* ------------------------------------------------------- */
619
+ /* HERK */
620
+ void CUBLASWINAPI cublasCherk(
621
+ char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc);
622
+ void CUBLASWINAPI cublasZherk(char uplo,
623
+ char trans,
624
+ int n,
625
+ int k,
626
+ double alpha,
627
+ const cuDoubleComplex* A,
628
+ int lda,
629
+ double beta,
630
+ cuDoubleComplex* C,
631
+ int ldc);
632
+ /* ------------------------------------------------------- */
633
+ /* SYR2K */
634
+ void CUBLASWINAPI cublasSsyr2k(char uplo,
635
+ char trans,
636
+ int n,
637
+ int k,
638
+ float alpha,
639
+ const float* A,
640
+ int lda,
641
+ const float* B,
642
+ int ldb,
643
+ float beta,
644
+ float* C,
645
+ int ldc);
646
+
647
+ void CUBLASWINAPI cublasDsyr2k(char uplo,
648
+ char trans,
649
+ int n,
650
+ int k,
651
+ double alpha,
652
+ const double* A,
653
+ int lda,
654
+ const double* B,
655
+ int ldb,
656
+ double beta,
657
+ double* C,
658
+ int ldc);
659
+ void CUBLASWINAPI cublasCsyr2k(char uplo,
660
+ char trans,
661
+ int n,
662
+ int k,
663
+ cuComplex alpha,
664
+ const cuComplex* A,
665
+ int lda,
666
+ const cuComplex* B,
667
+ int ldb,
668
+ cuComplex beta,
669
+ cuComplex* C,
670
+ int ldc);
671
+
672
+ void CUBLASWINAPI cublasZsyr2k(char uplo,
673
+ char trans,
674
+ int n,
675
+ int k,
676
+ cuDoubleComplex alpha,
677
+ const cuDoubleComplex* A,
678
+ int lda,
679
+ const cuDoubleComplex* B,
680
+ int ldb,
681
+ cuDoubleComplex beta,
682
+ cuDoubleComplex* C,
683
+ int ldc);
684
+ /* ------------------------------------------------------- */
685
+ /* HER2K */
686
+ void CUBLASWINAPI cublasCher2k(char uplo,
687
+ char trans,
688
+ int n,
689
+ int k,
690
+ cuComplex alpha,
691
+ const cuComplex* A,
692
+ int lda,
693
+ const cuComplex* B,
694
+ int ldb,
695
+ float beta,
696
+ cuComplex* C,
697
+ int ldc);
698
+
699
+ void CUBLASWINAPI cublasZher2k(char uplo,
700
+ char trans,
701
+ int n,
702
+ int k,
703
+ cuDoubleComplex alpha,
704
+ const cuDoubleComplex* A,
705
+ int lda,
706
+ const cuDoubleComplex* B,
707
+ int ldb,
708
+ double beta,
709
+ cuDoubleComplex* C,
710
+ int ldc);
711
+
712
+ /*------------------------------------------------------------------------*/
713
+ /* SYMM*/
714
+ void CUBLASWINAPI cublasSsymm(char side,
715
+ char uplo,
716
+ int m,
717
+ int n,
718
+ float alpha,
719
+ const float* A,
720
+ int lda,
721
+ const float* B,
722
+ int ldb,
723
+ float beta,
724
+ float* C,
725
+ int ldc);
726
+ void CUBLASWINAPI cublasDsymm(char side,
727
+ char uplo,
728
+ int m,
729
+ int n,
730
+ double alpha,
731
+ const double* A,
732
+ int lda,
733
+ const double* B,
734
+ int ldb,
735
+ double beta,
736
+ double* C,
737
+ int ldc);
738
+
739
+ void CUBLASWINAPI cublasCsymm(char side,
740
+ char uplo,
741
+ int m,
742
+ int n,
743
+ cuComplex alpha,
744
+ const cuComplex* A,
745
+ int lda,
746
+ const cuComplex* B,
747
+ int ldb,
748
+ cuComplex beta,
749
+ cuComplex* C,
750
+ int ldc);
751
+
752
+ void CUBLASWINAPI cublasZsymm(char side,
753
+ char uplo,
754
+ int m,
755
+ int n,
756
+ cuDoubleComplex alpha,
757
+ const cuDoubleComplex* A,
758
+ int lda,
759
+ const cuDoubleComplex* B,
760
+ int ldb,
761
+ cuDoubleComplex beta,
762
+ cuDoubleComplex* C,
763
+ int ldc);
764
+ /*------------------------------------------------------------------------*/
765
+ /* HEMM*/
766
+ void CUBLASWINAPI cublasChemm(char side,
767
+ char uplo,
768
+ int m,
769
+ int n,
770
+ cuComplex alpha,
771
+ const cuComplex* A,
772
+ int lda,
773
+ const cuComplex* B,
774
+ int ldb,
775
+ cuComplex beta,
776
+ cuComplex* C,
777
+ int ldc);
778
+ void CUBLASWINAPI cublasZhemm(char side,
779
+ char uplo,
780
+ int m,
781
+ int n,
782
+ cuDoubleComplex alpha,
783
+ const cuDoubleComplex* A,
784
+ int lda,
785
+ const cuDoubleComplex* B,
786
+ int ldb,
787
+ cuDoubleComplex beta,
788
+ cuDoubleComplex* C,
789
+ int ldc);
790
+
791
+ /*------------------------------------------------------------------------*/
792
+ /* TRSM*/
793
+ void CUBLASWINAPI cublasStrsm(char side,
794
+ char uplo,
795
+ char transa,
796
+ char diag,
797
+ int m,
798
+ int n,
799
+ float alpha,
800
+ const float* A,
801
+ int lda,
802
+ float* B,
803
+ int ldb);
804
+
805
+ void CUBLASWINAPI cublasDtrsm(char side,
806
+ char uplo,
807
+ char transa,
808
+ char diag,
809
+ int m,
810
+ int n,
811
+ double alpha,
812
+ const double* A,
813
+ int lda,
814
+ double* B,
815
+ int ldb);
816
+
817
+ void CUBLASWINAPI cublasCtrsm(char side,
818
+ char uplo,
819
+ char transa,
820
+ char diag,
821
+ int m,
822
+ int n,
823
+ cuComplex alpha,
824
+ const cuComplex* A,
825
+ int lda,
826
+ cuComplex* B,
827
+ int ldb);
828
+
829
+ void CUBLASWINAPI cublasZtrsm(char side,
830
+ char uplo,
831
+ char transa,
832
+ char diag,
833
+ int m,
834
+ int n,
835
+ cuDoubleComplex alpha,
836
+ const cuDoubleComplex* A,
837
+ int lda,
838
+ cuDoubleComplex* B,
839
+ int ldb);
840
+ /*------------------------------------------------------------------------*/
841
+ /* TRMM*/
842
+ void CUBLASWINAPI cublasStrmm(char side,
843
+ char uplo,
844
+ char transa,
845
+ char diag,
846
+ int m,
847
+ int n,
848
+ float alpha,
849
+ const float* A,
850
+ int lda,
851
+ float* B,
852
+ int ldb);
853
+ void CUBLASWINAPI cublasDtrmm(char side,
854
+ char uplo,
855
+ char transa,
856
+ char diag,
857
+ int m,
858
+ int n,
859
+ double alpha,
860
+ const double* A,
861
+ int lda,
862
+ double* B,
863
+ int ldb);
864
+ void CUBLASWINAPI cublasCtrmm(char side,
865
+ char uplo,
866
+ char transa,
867
+ char diag,
868
+ int m,
869
+ int n,
870
+ cuComplex alpha,
871
+ const cuComplex* A,
872
+ int lda,
873
+ cuComplex* B,
874
+ int ldb);
875
+ void CUBLASWINAPI cublasZtrmm(char side,
876
+ char uplo,
877
+ char transa,
878
+ char diag,
879
+ int m,
880
+ int n,
881
+ cuDoubleComplex alpha,
882
+ const cuDoubleComplex* A,
883
+ int lda,
884
+ cuDoubleComplex* B,
885
+ int ldb);
886
+
887
+ #if defined(__cplusplus)
888
+ }
889
+ #endif /* __cplusplus */
890
+
891
+ #endif /* !defined(CUBLAS_H_) */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cublasXt : Host API, Out of Core and Multi-GPU BLAS Library
51
+
52
+ */
53
+
54
+ #if !defined(CUBLAS_XT_H_)
55
+ #define CUBLAS_XT_H_
56
+
57
+ #include "driver_types.h"
58
+ #include "cuComplex.h" /* import complex data type */
59
+
60
+ #include "cublas_v2.h"
61
+
62
+ #if defined(__cplusplus)
63
+ extern "C" {
64
+ #endif /* __cplusplus */
65
+
66
+ struct cublasXtContext;
67
+ typedef struct cublasXtContext* cublasXtHandle_t;
68
+
69
+ cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle);
70
+ cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle);
71
+ cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards);
72
+ cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards);
73
+ /* This routine selects the Gpus that the user want to use for CUBLAS-XT */
74
+ cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]);
75
+
76
+ /* This routine allows to change the dimension of the tiles ( blockDim x blockDim ) */
77
+ cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim);
78
+ cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim);
79
+
80
+ typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
81
+ /* This routine allows to CUBLAS-XT to pin the Host memory if it find out that some of the matrix passed
82
+ are not pinned : Pinning/Unpinning the Host memory is still a costly operation
83
+ It is better if the user controls the memory on its own (by pinning/unpinning oly when necessary)
84
+ */
85
+ cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode);
86
+ cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode);
87
+
88
+ /* This routines is to provide a CPU Blas routines, used for too small sizes or hybrid computation */
89
+ typedef enum {
90
+ CUBLASXT_FLOAT = 0,
91
+ CUBLASXT_DOUBLE = 1,
92
+ CUBLASXT_COMPLEX = 2,
93
+ CUBLASXT_DOUBLECOMPLEX = 3,
94
+ } cublasXtOpType_t;
95
+
96
+ typedef enum {
97
+ CUBLASXT_GEMM = 0,
98
+ CUBLASXT_SYRK = 1,
99
+ CUBLASXT_HERK = 2,
100
+ CUBLASXT_SYMM = 3,
101
+ CUBLASXT_HEMM = 4,
102
+ CUBLASXT_TRSM = 5,
103
+ CUBLASXT_SYR2K = 6,
104
+ CUBLASXT_HER2K = 7,
105
+
106
+ CUBLASXT_SPMM = 8,
107
+ CUBLASXT_SYRKX = 9,
108
+ CUBLASXT_HERKX = 10,
109
+ CUBLASXT_TRMM = 11,
110
+ CUBLASXT_ROUTINE_MAX = 12,
111
+ } cublasXtBlasOp_t;
112
+
113
+ /* Currently only 32-bit integer BLAS routines are supported */
114
+ cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle,
115
+ cublasXtBlasOp_t blasOp,
116
+ cublasXtOpType_t type,
117
+ void* blasFunctor);
118
+
119
+ /* Specified the percentage of work that should done by the CPU, default is 0 (no work) */
120
+ cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle,
121
+ cublasXtBlasOp_t blasOp,
122
+ cublasXtOpType_t type,
123
+ float ratio);
124
+
125
+ /* GEMM */
126
+ cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle,
127
+ cublasOperation_t transa,
128
+ cublasOperation_t transb,
129
+ size_t m,
130
+ size_t n,
131
+ size_t k,
132
+ const float* alpha,
133
+ const float* A,
134
+ size_t lda,
135
+ const float* B,
136
+ size_t ldb,
137
+ const float* beta,
138
+ float* C,
139
+ size_t ldc);
140
+
141
+ cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle,
142
+ cublasOperation_t transa,
143
+ cublasOperation_t transb,
144
+ size_t m,
145
+ size_t n,
146
+ size_t k,
147
+ const double* alpha,
148
+ const double* A,
149
+ size_t lda,
150
+ const double* B,
151
+ size_t ldb,
152
+ const double* beta,
153
+ double* C,
154
+ size_t ldc);
155
+
156
+ cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle,
157
+ cublasOperation_t transa,
158
+ cublasOperation_t transb,
159
+ size_t m,
160
+ size_t n,
161
+ size_t k,
162
+ const cuComplex* alpha,
163
+ const cuComplex* A,
164
+ size_t lda,
165
+ const cuComplex* B,
166
+ size_t ldb,
167
+ const cuComplex* beta,
168
+ cuComplex* C,
169
+ size_t ldc);
170
+
171
+ cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle,
172
+ cublasOperation_t transa,
173
+ cublasOperation_t transb,
174
+ size_t m,
175
+ size_t n,
176
+ size_t k,
177
+ const cuDoubleComplex* alpha,
178
+ const cuDoubleComplex* A,
179
+ size_t lda,
180
+ const cuDoubleComplex* B,
181
+ size_t ldb,
182
+ const cuDoubleComplex* beta,
183
+ cuDoubleComplex* C,
184
+ size_t ldc);
185
+ /* ------------------------------------------------------- */
186
+ /* SYRK */
187
+ cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle,
188
+ cublasFillMode_t uplo,
189
+ cublasOperation_t trans,
190
+ size_t n,
191
+ size_t k,
192
+ const float* alpha,
193
+ const float* A,
194
+ size_t lda,
195
+ const float* beta,
196
+ float* C,
197
+ size_t ldc);
198
+
199
+ cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle,
200
+ cublasFillMode_t uplo,
201
+ cublasOperation_t trans,
202
+ size_t n,
203
+ size_t k,
204
+ const double* alpha,
205
+ const double* A,
206
+ size_t lda,
207
+ const double* beta,
208
+ double* C,
209
+ size_t ldc);
210
+
211
+ cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle,
212
+ cublasFillMode_t uplo,
213
+ cublasOperation_t trans,
214
+ size_t n,
215
+ size_t k,
216
+ const cuComplex* alpha,
217
+ const cuComplex* A,
218
+ size_t lda,
219
+ const cuComplex* beta,
220
+ cuComplex* C,
221
+ size_t ldc);
222
+
223
+ cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle,
224
+ cublasFillMode_t uplo,
225
+ cublasOperation_t trans,
226
+ size_t n,
227
+ size_t k,
228
+ const cuDoubleComplex* alpha,
229
+ const cuDoubleComplex* A,
230
+ size_t lda,
231
+ const cuDoubleComplex* beta,
232
+ cuDoubleComplex* C,
233
+ size_t ldc);
234
+ /* -------------------------------------------------------------------- */
235
+ /* HERK */
236
+ cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle,
237
+ cublasFillMode_t uplo,
238
+ cublasOperation_t trans,
239
+ size_t n,
240
+ size_t k,
241
+ const float* alpha,
242
+ const cuComplex* A,
243
+ size_t lda,
244
+ const float* beta,
245
+ cuComplex* C,
246
+ size_t ldc);
247
+
248
+ cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle,
249
+ cublasFillMode_t uplo,
250
+ cublasOperation_t trans,
251
+ size_t n,
252
+ size_t k,
253
+ const double* alpha,
254
+ const cuDoubleComplex* A,
255
+ size_t lda,
256
+ const double* beta,
257
+ cuDoubleComplex* C,
258
+ size_t ldc);
259
+ /* -------------------------------------------------------------------- */
260
+ /* SYR2K */
261
+ cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle,
262
+ cublasFillMode_t uplo,
263
+ cublasOperation_t trans,
264
+ size_t n,
265
+ size_t k,
266
+ const float* alpha,
267
+ const float* A,
268
+ size_t lda,
269
+ const float* B,
270
+ size_t ldb,
271
+ const float* beta,
272
+ float* C,
273
+ size_t ldc);
274
+
275
+ cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle,
276
+ cublasFillMode_t uplo,
277
+ cublasOperation_t trans,
278
+ size_t n,
279
+ size_t k,
280
+ const double* alpha,
281
+ const double* A,
282
+ size_t lda,
283
+ const double* B,
284
+ size_t ldb,
285
+ const double* beta,
286
+ double* C,
287
+ size_t ldc);
288
+
289
+ cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle,
290
+ cublasFillMode_t uplo,
291
+ cublasOperation_t trans,
292
+ size_t n,
293
+ size_t k,
294
+ const cuComplex* alpha,
295
+ const cuComplex* A,
296
+ size_t lda,
297
+ const cuComplex* B,
298
+ size_t ldb,
299
+ const cuComplex* beta,
300
+ cuComplex* C,
301
+ size_t ldc);
302
+
303
+ cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle,
304
+ cublasFillMode_t uplo,
305
+ cublasOperation_t trans,
306
+ size_t n,
307
+ size_t k,
308
+ const cuDoubleComplex* alpha,
309
+ const cuDoubleComplex* A,
310
+ size_t lda,
311
+ const cuDoubleComplex* B,
312
+ size_t ldb,
313
+ const cuDoubleComplex* beta,
314
+ cuDoubleComplex* C,
315
+ size_t ldc);
316
+ /* -------------------------------------------------------------------- */
317
+ /* HERKX : variant extension of HERK */
318
+ cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle,
319
+ cublasFillMode_t uplo,
320
+ cublasOperation_t trans,
321
+ size_t n,
322
+ size_t k,
323
+ const cuComplex* alpha,
324
+ const cuComplex* A,
325
+ size_t lda,
326
+ const cuComplex* B,
327
+ size_t ldb,
328
+ const float* beta,
329
+ cuComplex* C,
330
+ size_t ldc);
331
+
332
+ cublasStatus_t CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle,
333
+ cublasFillMode_t uplo,
334
+ cublasOperation_t trans,
335
+ size_t n,
336
+ size_t k,
337
+ const cuDoubleComplex* alpha,
338
+ const cuDoubleComplex* A,
339
+ size_t lda,
340
+ const cuDoubleComplex* B,
341
+ size_t ldb,
342
+ const double* beta,
343
+ cuDoubleComplex* C,
344
+ size_t ldc);
345
+
346
+ /* -------------------------------------------------------------------- */
347
+ /* TRSM */
348
+ cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle,
349
+ cublasSideMode_t side,
350
+ cublasFillMode_t uplo,
351
+ cublasOperation_t trans,
352
+ cublasDiagType_t diag,
353
+ size_t m,
354
+ size_t n,
355
+ const float* alpha,
356
+ const float* A,
357
+ size_t lda,
358
+ float* B,
359
+ size_t ldb);
360
+
361
+ cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle,
362
+ cublasSideMode_t side,
363
+ cublasFillMode_t uplo,
364
+ cublasOperation_t trans,
365
+ cublasDiagType_t diag,
366
+ size_t m,
367
+ size_t n,
368
+ const double* alpha,
369
+ const double* A,
370
+ size_t lda,
371
+ double* B,
372
+ size_t ldb);
373
+
374
+ cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle,
375
+ cublasSideMode_t side,
376
+ cublasFillMode_t uplo,
377
+ cublasOperation_t trans,
378
+ cublasDiagType_t diag,
379
+ size_t m,
380
+ size_t n,
381
+ const cuComplex* alpha,
382
+ const cuComplex* A,
383
+ size_t lda,
384
+ cuComplex* B,
385
+ size_t ldb);
386
+
387
+ cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle,
388
+ cublasSideMode_t side,
389
+ cublasFillMode_t uplo,
390
+ cublasOperation_t trans,
391
+ cublasDiagType_t diag,
392
+ size_t m,
393
+ size_t n,
394
+ const cuDoubleComplex* alpha,
395
+ const cuDoubleComplex* A,
396
+ size_t lda,
397
+ cuDoubleComplex* B,
398
+ size_t ldb);
399
+ /* -------------------------------------------------------------------- */
400
+ /* SYMM : Symmetric Multiply Matrix*/
401
+ cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle,
402
+ cublasSideMode_t side,
403
+ cublasFillMode_t uplo,
404
+ size_t m,
405
+ size_t n,
406
+ const float* alpha,
407
+ const float* A,
408
+ size_t lda,
409
+ const float* B,
410
+ size_t ldb,
411
+ const float* beta,
412
+ float* C,
413
+ size_t ldc);
414
+
415
+ cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t handle,
416
+ cublasSideMode_t side,
417
+ cublasFillMode_t uplo,
418
+ size_t m,
419
+ size_t n,
420
+ const double* alpha,
421
+ const double* A,
422
+ size_t lda,
423
+ const double* B,
424
+ size_t ldb,
425
+ const double* beta,
426
+ double* C,
427
+ size_t ldc);
428
+
429
+ cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle,
430
+ cublasSideMode_t side,
431
+ cublasFillMode_t uplo,
432
+ size_t m,
433
+ size_t n,
434
+ const cuComplex* alpha,
435
+ const cuComplex* A,
436
+ size_t lda,
437
+ const cuComplex* B,
438
+ size_t ldb,
439
+ const cuComplex* beta,
440
+ cuComplex* C,
441
+ size_t ldc);
442
+
443
+ cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle,
444
+ cublasSideMode_t side,
445
+ cublasFillMode_t uplo,
446
+ size_t m,
447
+ size_t n,
448
+ const cuDoubleComplex* alpha,
449
+ const cuDoubleComplex* A,
450
+ size_t lda,
451
+ const cuDoubleComplex* B,
452
+ size_t ldb,
453
+ const cuDoubleComplex* beta,
454
+ cuDoubleComplex* C,
455
+ size_t ldc);
456
+ /* -------------------------------------------------------------------- */
457
+ /* HEMM : Hermitian Matrix Multiply */
458
+ cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle,
459
+ cublasSideMode_t side,
460
+ cublasFillMode_t uplo,
461
+ size_t m,
462
+ size_t n,
463
+ const cuComplex* alpha,
464
+ const cuComplex* A,
465
+ size_t lda,
466
+ const cuComplex* B,
467
+ size_t ldb,
468
+ const cuComplex* beta,
469
+ cuComplex* C,
470
+ size_t ldc);
471
+
472
+ cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle,
473
+ cublasSideMode_t side,
474
+ cublasFillMode_t uplo,
475
+ size_t m,
476
+ size_t n,
477
+ const cuDoubleComplex* alpha,
478
+ const cuDoubleComplex* A,
479
+ size_t lda,
480
+ const cuDoubleComplex* B,
481
+ size_t ldb,
482
+ const cuDoubleComplex* beta,
483
+ cuDoubleComplex* C,
484
+ size_t ldc);
485
+
486
+ /* -------------------------------------------------------------------- */
487
+ /* SYRKX : variant extension of SYRK */
488
+ cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle,
489
+ cublasFillMode_t uplo,
490
+ cublasOperation_t trans,
491
+ size_t n,
492
+ size_t k,
493
+ const float* alpha,
494
+ const float* A,
495
+ size_t lda,
496
+ const float* B,
497
+ size_t ldb,
498
+ const float* beta,
499
+ float* C,
500
+ size_t ldc);
501
+
502
+ cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle,
503
+ cublasFillMode_t uplo,
504
+ cublasOperation_t trans,
505
+ size_t n,
506
+ size_t k,
507
+ const double* alpha,
508
+ const double* A,
509
+ size_t lda,
510
+ const double* B,
511
+ size_t ldb,
512
+ const double* beta,
513
+ double* C,
514
+ size_t ldc);
515
+
516
+ cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle,
517
+ cublasFillMode_t uplo,
518
+ cublasOperation_t trans,
519
+ size_t n,
520
+ size_t k,
521
+ const cuComplex* alpha,
522
+ const cuComplex* A,
523
+ size_t lda,
524
+ const cuComplex* B,
525
+ size_t ldb,
526
+ const cuComplex* beta,
527
+ cuComplex* C,
528
+ size_t ldc);
529
+
530
+ cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle,
531
+ cublasFillMode_t uplo,
532
+ cublasOperation_t trans,
533
+ size_t n,
534
+ size_t k,
535
+ const cuDoubleComplex* alpha,
536
+ const cuDoubleComplex* A,
537
+ size_t lda,
538
+ const cuDoubleComplex* B,
539
+ size_t ldb,
540
+ const cuDoubleComplex* beta,
541
+ cuDoubleComplex* C,
542
+ size_t ldc);
543
+ /* -------------------------------------------------------------------- */
544
+ /* HER2K : variant extension of HERK */
545
+ cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle,
546
+ cublasFillMode_t uplo,
547
+ cublasOperation_t trans,
548
+ size_t n,
549
+ size_t k,
550
+ const cuComplex* alpha,
551
+ const cuComplex* A,
552
+ size_t lda,
553
+ const cuComplex* B,
554
+ size_t ldb,
555
+ const float* beta,
556
+ cuComplex* C,
557
+ size_t ldc);
558
+
559
+ cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle,
560
+ cublasFillMode_t uplo,
561
+ cublasOperation_t trans,
562
+ size_t n,
563
+ size_t k,
564
+ const cuDoubleComplex* alpha,
565
+ const cuDoubleComplex* A,
566
+ size_t lda,
567
+ const cuDoubleComplex* B,
568
+ size_t ldb,
569
+ const double* beta,
570
+ cuDoubleComplex* C,
571
+ size_t ldc);
572
+
573
+ /* -------------------------------------------------------------------- */
574
+ /* SPMM : Symmetric Packed Multiply Matrix*/
575
+ cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle,
576
+ cublasSideMode_t side,
577
+ cublasFillMode_t uplo,
578
+ size_t m,
579
+ size_t n,
580
+ const float* alpha,
581
+ const float* AP,
582
+ const float* B,
583
+ size_t ldb,
584
+ const float* beta,
585
+ float* C,
586
+ size_t ldc);
587
+
588
+ cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle,
589
+ cublasSideMode_t side,
590
+ cublasFillMode_t uplo,
591
+ size_t m,
592
+ size_t n,
593
+ const double* alpha,
594
+ const double* AP,
595
+ const double* B,
596
+ size_t ldb,
597
+ const double* beta,
598
+ double* C,
599
+ size_t ldc);
600
+
601
+ cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle,
602
+ cublasSideMode_t side,
603
+ cublasFillMode_t uplo,
604
+ size_t m,
605
+ size_t n,
606
+ const cuComplex* alpha,
607
+ const cuComplex* AP,
608
+ const cuComplex* B,
609
+ size_t ldb,
610
+ const cuComplex* beta,
611
+ cuComplex* C,
612
+ size_t ldc);
613
+
614
+ cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle,
615
+ cublasSideMode_t side,
616
+ cublasFillMode_t uplo,
617
+ size_t m,
618
+ size_t n,
619
+ const cuDoubleComplex* alpha,
620
+ const cuDoubleComplex* AP,
621
+ const cuDoubleComplex* B,
622
+ size_t ldb,
623
+ const cuDoubleComplex* beta,
624
+ cuDoubleComplex* C,
625
+ size_t ldc);
626
+
627
+ /* -------------------------------------------------------------------- */
628
+ /* TRMM */
629
+ cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle,
630
+ cublasSideMode_t side,
631
+ cublasFillMode_t uplo,
632
+ cublasOperation_t trans,
633
+ cublasDiagType_t diag,
634
+ size_t m,
635
+ size_t n,
636
+ const float* alpha,
637
+ const float* A,
638
+ size_t lda,
639
+ const float* B,
640
+ size_t ldb,
641
+ float* C,
642
+ size_t ldc);
643
+
644
+ cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle,
645
+ cublasSideMode_t side,
646
+ cublasFillMode_t uplo,
647
+ cublasOperation_t trans,
648
+ cublasDiagType_t diag,
649
+ size_t m,
650
+ size_t n,
651
+ const double* alpha,
652
+ const double* A,
653
+ size_t lda,
654
+ const double* B,
655
+ size_t ldb,
656
+ double* C,
657
+ size_t ldc);
658
+
659
+ cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle,
660
+ cublasSideMode_t side,
661
+ cublasFillMode_t uplo,
662
+ cublasOperation_t trans,
663
+ cublasDiagType_t diag,
664
+ size_t m,
665
+ size_t n,
666
+ const cuComplex* alpha,
667
+ const cuComplex* A,
668
+ size_t lda,
669
+ const cuComplex* B,
670
+ size_t ldb,
671
+ cuComplex* C,
672
+ size_t ldc);
673
+
674
+ cublasStatus_t CUBLASWINAPI cublasXtZtrmm(cublasXtHandle_t handle,
675
+ cublasSideMode_t side,
676
+ cublasFillMode_t uplo,
677
+ cublasOperation_t trans,
678
+ cublasDiagType_t diag,
679
+ size_t m,
680
+ size_t n,
681
+ const cuDoubleComplex* alpha,
682
+ const cuDoubleComplex* A,
683
+ size_t lda,
684
+ const cuDoubleComplex* B,
685
+ size_t ldb,
686
+ cuDoubleComplex* C,
687
+ size_t ldc);
688
+
689
+ #if defined(__cplusplus)
690
+ }
691
+ #endif /* __cplusplus */
692
+
693
+ #endif /* !defined(CUBLAS_XT_H_) */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h ADDED
@@ -0,0 +1,824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(NVBLAS_H_)
51
+ #define NVBLAS_H_
52
+
53
+ #include "driver_types.h"
54
+ #include "cuComplex.h" /* import complex data type */
55
+
56
+ #if defined(__cplusplus)
57
+ extern "C" {
58
+ #endif
59
+
60
+ /* GEMM */
61
+ void sgemm_(const char* transa,
62
+ const char* transb,
63
+ const int* m,
64
+ const int* n,
65
+ const int* k,
66
+ const float* alpha,
67
+ const float* a,
68
+ const int* lda,
69
+ const float* b,
70
+ const int* ldb,
71
+ const float* beta,
72
+ float* c,
73
+ const int* ldc);
74
+
75
+ void dgemm_(const char* transa,
76
+ const char* transb,
77
+ const int* m,
78
+ const int* n,
79
+ const int* k,
80
+ const double* alpha,
81
+ const double* a,
82
+ const int* lda,
83
+ const double* b,
84
+ const int* ldb,
85
+ const double* beta,
86
+ double* c,
87
+ const int* ldc);
88
+
89
+ void cgemm_(const char* transa,
90
+ const char* transb,
91
+ const int* m,
92
+ const int* n,
93
+ const int* k,
94
+ const cuComplex* alpha,
95
+ const cuComplex* a,
96
+ const int* lda,
97
+ const cuComplex* b,
98
+ const int* ldb,
99
+ const cuComplex* beta,
100
+ cuComplex* c,
101
+ const int* ldc);
102
+
103
+ void zgemm_(const char* transa,
104
+ const char* transb,
105
+ const int* m,
106
+ const int* n,
107
+ const int* k,
108
+ const cuDoubleComplex* alpha,
109
+ const cuDoubleComplex* a,
110
+ const int* lda,
111
+ const cuDoubleComplex* b,
112
+ const int* ldb,
113
+ const cuDoubleComplex* beta,
114
+ cuDoubleComplex* c,
115
+ const int* ldc);
116
+
117
+ void sgemm(const char* transa,
118
+ const char* transb,
119
+ const int* m,
120
+ const int* n,
121
+ const int* k,
122
+ const float* alpha,
123
+ const float* a,
124
+ const int* lda,
125
+ const float* b,
126
+ const int* ldb,
127
+ const float* beta,
128
+ float* c,
129
+ const int* ldc);
130
+
131
+ void dgemm(const char* transa,
132
+ const char* transb,
133
+ const int* m,
134
+ const int* n,
135
+ const int* k,
136
+ const double* alpha,
137
+ const double* a,
138
+ const int* lda,
139
+ const double* b,
140
+ const int* ldb,
141
+ const double* beta,
142
+ double* c,
143
+ const int* ldc);
144
+
145
+ void cgemm(const char* transa,
146
+ const char* transb,
147
+ const int* m,
148
+ const int* n,
149
+ const int* k,
150
+ const cuComplex* alpha,
151
+ const cuComplex* a,
152
+ const int* lda,
153
+ const cuComplex* b,
154
+ const int* ldb,
155
+ const cuComplex* beta,
156
+ cuComplex* c,
157
+ const int* ldc);
158
+
159
+ void zgemm(const char* transa,
160
+ const char* transb,
161
+ const int* m,
162
+ const int* n,
163
+ const int* k,
164
+ const cuDoubleComplex* alpha,
165
+ const cuDoubleComplex* a,
166
+ const int* lda,
167
+ const cuDoubleComplex* b,
168
+ const int* ldb,
169
+ const cuDoubleComplex* beta,
170
+ cuDoubleComplex* c,
171
+ const int* ldc);
172
+
173
+ /* SYRK */
174
+ void ssyrk_(const char* uplo,
175
+ const char* trans,
176
+ const int* n,
177
+ const int* k,
178
+ const float* alpha,
179
+ const float* a,
180
+ const int* lda,
181
+ const float* beta,
182
+ float* c,
183
+ const int* ldc);
184
+
185
+ void dsyrk_(const char* uplo,
186
+ const char* trans,
187
+ const int* n,
188
+ const int* k,
189
+ const double* alpha,
190
+ const double* a,
191
+ const int* lda,
192
+ const double* beta,
193
+ double* c,
194
+ const int* ldc);
195
+
196
+ void csyrk_(const char* uplo,
197
+ const char* trans,
198
+ const int* n,
199
+ const int* k,
200
+ const cuComplex* alpha,
201
+ const cuComplex* a,
202
+ const int* lda,
203
+ const cuComplex* beta,
204
+ cuComplex* c,
205
+ const int* ldc);
206
+
207
+ void zsyrk_(const char* uplo,
208
+ const char* trans,
209
+ const int* n,
210
+ const int* k,
211
+ const cuDoubleComplex* alpha,
212
+ const cuDoubleComplex* a,
213
+ const int* lda,
214
+ const cuDoubleComplex* beta,
215
+ cuDoubleComplex* c,
216
+ const int* ldc);
217
+
218
+ void ssyrk(const char* uplo,
219
+ const char* trans,
220
+ const int* n,
221
+ const int* k,
222
+ const float* alpha,
223
+ const float* a,
224
+ const int* lda,
225
+ const float* beta,
226
+ float* c,
227
+ const int* ldc);
228
+
229
+ void dsyrk(const char* uplo,
230
+ const char* trans,
231
+ const int* n,
232
+ const int* k,
233
+ const double* alpha,
234
+ const double* a,
235
+ const int* lda,
236
+ const double* beta,
237
+ double* c,
238
+ const int* ldc);
239
+
240
+ void csyrk(const char* uplo,
241
+ const char* trans,
242
+ const int* n,
243
+ const int* k,
244
+ const cuComplex* alpha,
245
+ const cuComplex* a,
246
+ const int* lda,
247
+ const cuComplex* beta,
248
+ cuComplex* c,
249
+ const int* ldc);
250
+
251
+ void zsyrk(const char* uplo,
252
+ const char* trans,
253
+ const int* n,
254
+ const int* k,
255
+ const cuDoubleComplex* alpha,
256
+ const cuDoubleComplex* a,
257
+ const int* lda,
258
+ const cuDoubleComplex* beta,
259
+ cuDoubleComplex* c,
260
+ const int* ldc);
261
+
262
+ /* HERK */
263
+ void cherk_(const char* uplo,
264
+ const char* trans,
265
+ const int* n,
266
+ const int* k,
267
+ const float* alpha,
268
+ const cuComplex* a,
269
+ const int* lda,
270
+ const float* beta,
271
+ cuComplex* c,
272
+ const int* ldc);
273
+
274
+ void zherk_(const char* uplo,
275
+ const char* trans,
276
+ const int* n,
277
+ const int* k,
278
+ const double* alpha,
279
+ const cuDoubleComplex* a,
280
+ const int* lda,
281
+ const double* beta,
282
+ cuDoubleComplex* c,
283
+ const int* ldc);
284
+
285
+ void cherk(const char* uplo,
286
+ const char* trans,
287
+ const int* n,
288
+ const int* k,
289
+ const float* alpha,
290
+ const cuComplex* a,
291
+ const int* lda,
292
+ const float* beta,
293
+ cuComplex* c,
294
+ const int* ldc);
295
+
296
+ void zherk(const char* uplo,
297
+ const char* trans,
298
+ const int* n,
299
+ const int* k,
300
+ const double* alpha,
301
+ const cuDoubleComplex* a,
302
+ const int* lda,
303
+ const double* beta,
304
+ cuDoubleComplex* c,
305
+ const int* ldc);
306
+
307
+ /* TRSM */
308
+ void strsm_(const char* side,
309
+ const char* uplo,
310
+ const char* transa,
311
+ const char* diag,
312
+ const int* m,
313
+ const int* n,
314
+ const float* alpha,
315
+ const float* a,
316
+ const int* lda,
317
+ float* b,
318
+ const int* ldb);
319
+
320
+ void dtrsm_(const char* side,
321
+ const char* uplo,
322
+ const char* transa,
323
+ const char* diag,
324
+ const int* m,
325
+ const int* n,
326
+ const double* alpha,
327
+ const double* a,
328
+ const int* lda,
329
+ double* b,
330
+ const int* ldb);
331
+
332
+ void ctrsm_(const char* side,
333
+ const char* uplo,
334
+ const char* transa,
335
+ const char* diag,
336
+ const int* m,
337
+ const int* n,
338
+ const cuComplex* alpha,
339
+ const cuComplex* a,
340
+ const int* lda,
341
+ cuComplex* b,
342
+ const int* ldb);
343
+
344
+ void ztrsm_(const char* side,
345
+ const char* uplo,
346
+ const char* transa,
347
+ const char* diag,
348
+ const int* m,
349
+ const int* n,
350
+ const cuDoubleComplex* alpha,
351
+ const cuDoubleComplex* a,
352
+ const int* lda,
353
+ cuDoubleComplex* b,
354
+ const int* ldb);
355
+
356
+ void strsm(const char* side,
357
+ const char* uplo,
358
+ const char* transa,
359
+ const char* diag,
360
+ const int* m,
361
+ const int* n,
362
+ const float* alpha,
363
+ const float* a,
364
+ const int* lda,
365
+ float* b,
366
+ const int* ldb);
367
+
368
+ void dtrsm(const char* side,
369
+ const char* uplo,
370
+ const char* transa,
371
+ const char* diag,
372
+ const int* m,
373
+ const int* n,
374
+ const double* alpha,
375
+ const double* a,
376
+ const int* lda,
377
+ double* b,
378
+ const int* ldb);
379
+
380
+ void ctrsm(const char* side,
381
+ const char* uplo,
382
+ const char* transa,
383
+ const char* diag,
384
+ const int* m,
385
+ const int* n,
386
+ const cuComplex* alpha,
387
+ const cuComplex* a,
388
+ const int* lda,
389
+ cuComplex* b,
390
+ const int* ldb);
391
+
392
+ void ztrsm(const char* side,
393
+ const char* uplo,
394
+ const char* transa,
395
+ const char* diag,
396
+ const int* m,
397
+ const int* n,
398
+ const cuDoubleComplex* alpha,
399
+ const cuDoubleComplex* a,
400
+ const int* lda,
401
+ cuDoubleComplex* b,
402
+ const int* ldb);
403
+
404
+ /* SYMM */
405
+ void ssymm_(const char* side,
406
+ const char* uplo,
407
+ const int* m,
408
+ const int* n,
409
+ const float* alpha,
410
+ const float* a,
411
+ const int* lda,
412
+ const float* b,
413
+ const int* ldb,
414
+ const float* beta,
415
+ float* c,
416
+ const int* ldc);
417
+
418
+ void dsymm_(const char* side,
419
+ const char* uplo,
420
+ const int* m,
421
+ const int* n,
422
+ const double* alpha,
423
+ const double* a,
424
+ const int* lda,
425
+ const double* b,
426
+ const int* ldb,
427
+ const double* beta,
428
+ double* c,
429
+ const int* ldc);
430
+
431
+ void csymm_(const char* side,
432
+ const char* uplo,
433
+ const int* m,
434
+ const int* n,
435
+ const cuComplex* alpha,
436
+ const cuComplex* a,
437
+ const int* lda,
438
+ const cuComplex* b,
439
+ const int* ldb,
440
+ const cuComplex* beta,
441
+ cuComplex* c,
442
+ const int* ldc);
443
+
444
+ void zsymm_(const char* side,
445
+ const char* uplo,
446
+ const int* m,
447
+ const int* n,
448
+ const cuDoubleComplex* alpha,
449
+ const cuDoubleComplex* a,
450
+ const int* lda,
451
+ const cuDoubleComplex* b,
452
+ const int* ldb,
453
+ const cuDoubleComplex* beta,
454
+ cuDoubleComplex* c,
455
+ const int* ldc);
456
+
457
+ void ssymm(const char* side,
458
+ const char* uplo,
459
+ const int* m,
460
+ const int* n,
461
+ const float* alpha,
462
+ const float* a,
463
+ const int* lda,
464
+ const float* b,
465
+ const int* ldb,
466
+ const float* beta,
467
+ float* c,
468
+ const int* ldc);
469
+
470
+ void dsymm(const char* side,
471
+ const char* uplo,
472
+ const int* m,
473
+ const int* n,
474
+ const double* alpha,
475
+ const double* a,
476
+ const int* lda,
477
+ const double* b,
478
+ const int* ldb,
479
+ const double* beta,
480
+ double* c,
481
+ const int* ldc);
482
+
483
+ void csymm(const char* side,
484
+ const char* uplo,
485
+ const int* m,
486
+ const int* n,
487
+ const cuComplex* alpha,
488
+ const cuComplex* a,
489
+ const int* lda,
490
+ const cuComplex* b,
491
+ const int* ldb,
492
+ const cuComplex* beta,
493
+ cuComplex* c,
494
+ const int* ldc);
495
+
496
+ void zsymm(const char* side,
497
+ const char* uplo,
498
+ const int* m,
499
+ const int* n,
500
+ const cuDoubleComplex* alpha,
501
+ const cuDoubleComplex* a,
502
+ const int* lda,
503
+ const cuDoubleComplex* b,
504
+ const int* ldb,
505
+ const cuDoubleComplex* beta,
506
+ cuDoubleComplex* c,
507
+ const int* ldc);
508
+
509
+ /* HEMM */
510
+ void chemm_(const char* side,
511
+ const char* uplo,
512
+ const int* m,
513
+ const int* n,
514
+ const cuComplex* alpha,
515
+ const cuComplex* a,
516
+ const int* lda,
517
+ const cuComplex* b,
518
+ const int* ldb,
519
+ const cuComplex* beta,
520
+ cuComplex* c,
521
+ const int* ldc);
522
+
523
+ void zhemm_(const char* side,
524
+ const char* uplo,
525
+ const int* m,
526
+ const int* n,
527
+ const cuDoubleComplex* alpha,
528
+ const cuDoubleComplex* a,
529
+ const int* lda,
530
+ const cuDoubleComplex* b,
531
+ const int* ldb,
532
+ const cuDoubleComplex* beta,
533
+ cuDoubleComplex* c,
534
+ const int* ldc);
535
+
536
+ /* HEMM with no underscore*/
537
+ void chemm(const char* side,
538
+ const char* uplo,
539
+ const int* m,
540
+ const int* n,
541
+ const cuComplex* alpha,
542
+ const cuComplex* a,
543
+ const int* lda,
544
+ const cuComplex* b,
545
+ const int* ldb,
546
+ const cuComplex* beta,
547
+ cuComplex* c,
548
+ const int* ldc);
549
+
550
+ void zhemm(const char* side,
551
+ const char* uplo,
552
+ const int* m,
553
+ const int* n,
554
+ const cuDoubleComplex* alpha,
555
+ const cuDoubleComplex* a,
556
+ const int* lda,
557
+ const cuDoubleComplex* b,
558
+ const int* ldb,
559
+ const cuDoubleComplex* beta,
560
+ cuDoubleComplex* c,
561
+ const int* ldc);
562
+
563
+ /* SYR2K */
564
+ void ssyr2k_(const char* uplo,
565
+ const char* trans,
566
+ const int* n,
567
+ const int* k,
568
+ const float* alpha,
569
+ const float* a,
570
+ const int* lda,
571
+ const float* b,
572
+ const int* ldb,
573
+ const float* beta,
574
+ float* c,
575
+ const int* ldc);
576
+
577
+ void dsyr2k_(const char* uplo,
578
+ const char* trans,
579
+ const int* n,
580
+ const int* k,
581
+ const double* alpha,
582
+ const double* a,
583
+ const int* lda,
584
+ const double* b,
585
+ const int* ldb,
586
+ const double* beta,
587
+ double* c,
588
+ const int* ldc);
589
+
590
+ void csyr2k_(const char* uplo,
591
+ const char* trans,
592
+ const int* n,
593
+ const int* k,
594
+ const cuComplex* alpha,
595
+ const cuComplex* a,
596
+ const int* lda,
597
+ const cuComplex* b,
598
+ const int* ldb,
599
+ const cuComplex* beta,
600
+ cuComplex* c,
601
+ const int* ldc);
602
+
603
+ void zsyr2k_(const char* uplo,
604
+ const char* trans,
605
+ const int* n,
606
+ const int* k,
607
+ const cuDoubleComplex* alpha,
608
+ const cuDoubleComplex* a,
609
+ const int* lda,
610
+ const cuDoubleComplex* b,
611
+ const int* ldb,
612
+ const cuDoubleComplex* beta,
613
+ cuDoubleComplex* c,
614
+ const int* ldc);
615
+
616
+ /* SYR2K no_underscore*/
617
+ void ssyr2k(const char* uplo,
618
+ const char* trans,
619
+ const int* n,
620
+ const int* k,
621
+ const float* alpha,
622
+ const float* a,
623
+ const int* lda,
624
+ const float* b,
625
+ const int* ldb,
626
+ const float* beta,
627
+ float* c,
628
+ const int* ldc);
629
+
630
+ void dsyr2k(const char* uplo,
631
+ const char* trans,
632
+ const int* n,
633
+ const int* k,
634
+ const double* alpha,
635
+ const double* a,
636
+ const int* lda,
637
+ const double* b,
638
+ const int* ldb,
639
+ const double* beta,
640
+ double* c,
641
+ const int* ldc);
642
+
643
+ void csyr2k(const char* uplo,
644
+ const char* trans,
645
+ const int* n,
646
+ const int* k,
647
+ const cuComplex* alpha,
648
+ const cuComplex* a,
649
+ const int* lda,
650
+ const cuComplex* b,
651
+ const int* ldb,
652
+ const cuComplex* beta,
653
+ cuComplex* c,
654
+ const int* ldc);
655
+
656
+ void zsyr2k(const char* uplo,
657
+ const char* trans,
658
+ const int* n,
659
+ const int* k,
660
+ const cuDoubleComplex* alpha,
661
+ const cuDoubleComplex* a,
662
+ const int* lda,
663
+ const cuDoubleComplex* b,
664
+ const int* ldb,
665
+ const cuDoubleComplex* beta,
666
+ cuDoubleComplex* c,
667
+ const int* ldc);
668
+
669
+ /* HERK */
670
+ void cher2k_(const char* uplo,
671
+ const char* trans,
672
+ const int* n,
673
+ const int* k,
674
+ const cuComplex* alpha,
675
+ const cuComplex* a,
676
+ const int* lda,
677
+ const cuComplex* b,
678
+ const int* ldb,
679
+ const float* beta,
680
+ cuComplex* c,
681
+ const int* ldc);
682
+
683
+ void zher2k_(const char* uplo,
684
+ const char* trans,
685
+ const int* n,
686
+ const int* k,
687
+ const cuDoubleComplex* alpha,
688
+ const cuDoubleComplex* a,
689
+ const int* lda,
690
+ const cuDoubleComplex* b,
691
+ const int* ldb,
692
+ const double* beta,
693
+ cuDoubleComplex* c,
694
+ const int* ldc);
695
+
696
+ /* HER2K with no underscore */
697
+ void cher2k(const char* uplo,
698
+ const char* trans,
699
+ const int* n,
700
+ const int* k,
701
+ const cuComplex* alpha,
702
+ const cuComplex* a,
703
+ const int* lda,
704
+ const cuComplex* b,
705
+ const int* ldb,
706
+ const float* beta,
707
+ cuComplex* c,
708
+ const int* ldc);
709
+
710
+ void zher2k(const char* uplo,
711
+ const char* trans,
712
+ const int* n,
713
+ const int* k,
714
+ const cuDoubleComplex* alpha,
715
+ const cuDoubleComplex* a,
716
+ const int* lda,
717
+ const cuDoubleComplex* b,
718
+ const int* ldb,
719
+ const double* beta,
720
+ cuDoubleComplex* c,
721
+ const int* ldc);
722
+
723
+ /* TRMM */
724
+ void strmm_(const char* side,
725
+ const char* uplo,
726
+ const char* transa,
727
+ const char* diag,
728
+ const int* m,
729
+ const int* n,
730
+ const float* alpha,
731
+ const float* a,
732
+ const int* lda,
733
+ float* b,
734
+ const int* ldb);
735
+
736
+ void dtrmm_(const char* side,
737
+ const char* uplo,
738
+ const char* transa,
739
+ const char* diag,
740
+ const int* m,
741
+ const int* n,
742
+ const double* alpha,
743
+ const double* a,
744
+ const int* lda,
745
+ double* b,
746
+ const int* ldb);
747
+
748
+ void ctrmm_(const char* side,
749
+ const char* uplo,
750
+ const char* transa,
751
+ const char* diag,
752
+ const int* m,
753
+ const int* n,
754
+ const cuComplex* alpha,
755
+ const cuComplex* a,
756
+ const int* lda,
757
+ cuComplex* b,
758
+ const int* ldb);
759
+
760
+ void ztrmm_(const char* side,
761
+ const char* uplo,
762
+ const char* transa,
763
+ const char* diag,
764
+ const int* m,
765
+ const int* n,
766
+ const cuDoubleComplex* alpha,
767
+ const cuDoubleComplex* a,
768
+ const int* lda,
769
+ cuDoubleComplex* b,
770
+ const int* ldb);
771
+
772
+ void strmm(const char* side,
773
+ const char* uplo,
774
+ const char* transa,
775
+ const char* diag,
776
+ const int* m,
777
+ const int* n,
778
+ const float* alpha,
779
+ const float* a,
780
+ const int* lda,
781
+ float* b,
782
+ const int* ldb);
783
+
784
+ void dtrmm(const char* side,
785
+ const char* uplo,
786
+ const char* transa,
787
+ const char* diag,
788
+ const int* m,
789
+ const int* n,
790
+ const double* alpha,
791
+ const double* a,
792
+ const int* lda,
793
+ double* b,
794
+ const int* ldb);
795
+
796
+ void ctrmm(const char* side,
797
+ const char* uplo,
798
+ const char* transa,
799
+ const char* diag,
800
+ const int* m,
801
+ const int* n,
802
+ const cuComplex* alpha,
803
+ const cuComplex* a,
804
+ const int* lda,
805
+ cuComplex* b,
806
+ const int* ldb);
807
+
808
+ void ztrmm(const char* side,
809
+ const char* uplo,
810
+ const char* transa,
811
+ const char* diag,
812
+ const int* m,
813
+ const int* n,
814
+ const cuDoubleComplex* alpha,
815
+ const cuDoubleComplex* a,
816
+ const int* lda,
817
+ cuDoubleComplex* b,
818
+ const int* ldb);
819
+
820
+ #if defined(__cplusplus)
821
+ }
822
+ #endif /* __cplusplus */
823
+
824
+ #endif /* !defined(NVBLAS_H_) */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (218 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (218 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (226 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h ADDED
@@ -0,0 +1,758 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // NVIDIA_COPYRIGHT_BEGIN
3
+ //
4
+ // Copyright (c) 2014-2022, NVIDIA CORPORATION. All rights reserved.
5
+ //
6
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
7
+ // and proprietary rights in and to this software, related documentation
8
+ // and any modifications thereto. Any use, reproduction, disclosure or
9
+ // distribution of this software and related documentation without an express
10
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
11
+ //
12
+ // NVIDIA_COPYRIGHT_END
13
+ //
14
+
15
+ #ifndef __NVRTC_H__
16
+ #define __NVRTC_H__
17
+
18
+ #ifdef __cplusplus
19
+ extern "C" {
20
+ #endif /* __cplusplus */
21
+
22
+ #include <stdlib.h>
23
+
24
+
25
+ /*************************************************************************//**
26
+ *
27
+ * \defgroup error Error Handling
28
+ *
29
+ * NVRTC defines the following enumeration type and function for API call
30
+ * error handling.
31
+ *
32
+ ****************************************************************************/
33
+
34
+
35
+ /**
36
+ * \ingroup error
37
+ * \brief The enumerated type nvrtcResult defines API call result codes.
38
+ * NVRTC API functions return nvrtcResult to indicate the call
39
+ * result.
40
+ */
41
+ typedef enum {
42
+ NVRTC_SUCCESS = 0,
43
+ NVRTC_ERROR_OUT_OF_MEMORY = 1,
44
+ NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
45
+ NVRTC_ERROR_INVALID_INPUT = 3,
46
+ NVRTC_ERROR_INVALID_PROGRAM = 4,
47
+ NVRTC_ERROR_INVALID_OPTION = 5,
48
+ NVRTC_ERROR_COMPILATION = 6,
49
+ NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
50
+ NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
51
+ NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
52
+ NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
53
+ NVRTC_ERROR_INTERNAL_ERROR = 11
54
+ } nvrtcResult;
55
+
56
+
57
+ /**
58
+ * \ingroup error
59
+ * \brief nvrtcGetErrorString is a helper function that returns a string
60
+ * describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
61
+ * \c "NVRTC_SUCCESS".
62
+ * For unrecognized enumeration values, it returns
63
+ * \c "NVRTC_ERROR unknown".
64
+ *
65
+ * \param [in] result CUDA Runtime Compilation API result code.
66
+ * \return Message string for the given #nvrtcResult code.
67
+ */
68
+ const char *nvrtcGetErrorString(nvrtcResult result);
69
+
70
+
71
+ /*************************************************************************//**
72
+ *
73
+ * \defgroup query General Information Query
74
+ *
75
+ * NVRTC defines the following function for general information query.
76
+ *
77
+ ****************************************************************************/
78
+
79
+
80
+ /**
81
+ * \ingroup query
82
+ * \brief nvrtcVersion sets the output parameters \p major and \p minor
83
+ * with the CUDA Runtime Compilation version number.
84
+ *
85
+ * \param [out] major CUDA Runtime Compilation major version number.
86
+ * \param [out] minor CUDA Runtime Compilation minor version number.
87
+ * \return
88
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
89
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
90
+ *
91
+ */
92
+ nvrtcResult nvrtcVersion(int *major, int *minor);
93
+
94
+
95
+ /**
96
+ * \ingroup query
97
+ * \brief nvrtcGetNumSupportedArchs sets the output parameter \p numArchs
98
+ * with the number of architectures supported by NVRTC. This can
99
+ * then be used to pass an array to ::nvrtcGetSupportedArchs to
100
+ * get the supported architectures.
101
+ *
102
+ * \param [out] numArchs number of supported architectures.
103
+ * \return
104
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
105
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
106
+ *
107
+ * see ::nvrtcGetSupportedArchs
108
+ */
109
+ nvrtcResult nvrtcGetNumSupportedArchs(int* numArchs);
110
+
111
+
112
+ /**
113
+ * \ingroup query
114
+ * \brief nvrtcGetSupportedArchs populates the array passed via the output parameter
115
+ * \p supportedArchs with the architectures supported by NVRTC. The array is
116
+ * sorted in the ascending order. The size of the array to be passed can be
117
+ * determined using ::nvrtcGetNumSupportedArchs.
118
+ *
119
+ * \param [out] supportedArchs sorted array of supported architectures.
120
+ * \return
121
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
122
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
123
+ *
124
+ * see ::nvrtcGetNumSupportedArchs
125
+ */
126
+ nvrtcResult nvrtcGetSupportedArchs(int* supportedArchs);
127
+
128
+
129
+ /*************************************************************************//**
130
+ *
131
+ * \defgroup compilation Compilation
132
+ *
133
+ * NVRTC defines the following type and functions for actual compilation.
134
+ *
135
+ ****************************************************************************/
136
+
137
+
138
+ /**
139
+ * \ingroup compilation
140
+ * \brief nvrtcProgram is the unit of compilation, and an opaque handle for
141
+ * a program.
142
+ *
143
+ * To compile a CUDA program string, an instance of nvrtcProgram must be
144
+ * created first with ::nvrtcCreateProgram, then compiled with
145
+ * ::nvrtcCompileProgram.
146
+ */
147
+ typedef struct _nvrtcProgram *nvrtcProgram;
148
+
149
+
150
+ /**
151
+ * \ingroup compilation
152
+ * \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the
153
+ * given input parameters, and sets the output parameter \p prog with
154
+ * it.
155
+ *
156
+ * \param [out] prog CUDA Runtime Compilation program.
157
+ * \param [in] src CUDA program source.
158
+ * \param [in] name CUDA program name.\n
159
+ * \p name can be \c NULL; \c "default_program" is
160
+ * used when \p name is \c NULL or "".
161
+ * \param [in] numHeaders Number of headers used.\n
162
+ * \p numHeaders must be greater than or equal to 0.
163
+ * \param [in] headers Sources of the headers.\n
164
+ * \p headers can be \c NULL when \p numHeaders is
165
+ * 0.
166
+ * \param [in] includeNames Name of each header by which they can be
167
+ * included in the CUDA program source.\n
168
+ * \p includeNames can be \c NULL when \p numHeaders
169
+ * is 0.
170
+ * \return
171
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
172
+ * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
173
+ * - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
174
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
175
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
176
+ *
177
+ * \see ::nvrtcDestroyProgram
178
+ */
179
+ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
180
+ const char *src,
181
+ const char *name,
182
+ int numHeaders,
183
+ const char * const *headers,
184
+ const char * const *includeNames);
185
+
186
+
187
+ /**
188
+ * \ingroup compilation
189
+ * \brief nvrtcDestroyProgram destroys the given program.
190
+ *
191
+ * \param [in] prog CUDA Runtime Compilation program.
192
+ * \return
193
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
194
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
195
+ *
196
+ * \see ::nvrtcCreateProgram
197
+ */
198
+ nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
199
+
200
+
201
+ /**
202
+ * \ingroup compilation
203
+ * \brief nvrtcCompileProgram compiles the given program.
204
+ *
205
+ * \param [in] prog CUDA Runtime Compilation program.
206
+ * \param [in] numOptions Number of compiler options passed.
207
+ * \param [in] options Compiler options in the form of C string array.\n
208
+ * \p options can be \c NULL when \p numOptions is 0.
209
+ *
210
+ * \return
211
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
212
+ * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
213
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
214
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
215
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
216
+ * - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
217
+ * - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
218
+ *
219
+ * It supports compile options listed in \ref options.
220
+ */
221
+ nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
222
+ int numOptions, const char * const *options);
223
+
224
+
225
+ /**
226
+ * \ingroup compilation
227
+ * \brief nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX
228
+ * generated by the previous compilation of \p prog (including the
229
+ * trailing \c NULL).
230
+ *
231
+ * \param [in] prog CUDA Runtime Compilation program.
232
+ * \param [out] ptxSizeRet Size of the generated PTX (including the trailing
233
+ * \c NULL).
234
+ * \return
235
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
236
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
237
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
238
+ *
239
+ * \see ::nvrtcGetPTX
240
+ */
241
+ nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
242
+
243
+
244
+ /**
245
+ * \ingroup compilation
246
+ * \brief nvrtcGetPTX stores the PTX generated by the previous compilation
247
+ * of \p prog in the memory pointed by \p ptx.
248
+ *
249
+ * \param [in] prog CUDA Runtime Compilation program.
250
+ * \param [out] ptx Compiled result.
251
+ * \return
252
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
253
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
254
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
255
+ *
256
+ * \see ::nvrtcGetPTXSize
257
+ */
258
+ nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
259
+
260
+
261
+ /**
262
+ * \ingroup compilation
263
+ * \brief nvrtcGetCUBINSize sets \p cubinSizeRet with the size of the cubin
264
+ * generated by the previous compilation of \p prog. The value of
265
+ * cubinSizeRet is set to 0 if the value specified to \c -arch is a
266
+ * virtual architecture instead of an actual architecture.
267
+ *
268
+ * \param [in] prog CUDA Runtime Compilation program.
269
+ * \param [out] cubinSizeRet Size of the generated cubin.
270
+ * \return
271
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
272
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
273
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
274
+ *
275
+ * \see ::nvrtcGetCUBIN
276
+ */
277
+ nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet);
278
+
279
+
280
+ /**
281
+ * \ingroup compilation
282
+ * \brief nvrtcGetCUBIN stores the cubin generated by the previous compilation
283
+ * of \p prog in the memory pointed by \p cubin. No cubin is available
284
+ * if the value specified to \c -arch is a virtual architecture instead
285
+ * of an actual architecture.
286
+ *
287
+ * \param [in] prog CUDA Runtime Compilation program.
288
+ * \param [out] cubin Compiled and assembled result.
289
+ * \return
290
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
291
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
292
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
293
+ *
294
+ * \see ::nvrtcGetCUBINSize
295
+ */
296
+ nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
297
+
298
+ /**
299
+ * \ingroup compilation
300
+ * \brief nvrtcGetNVVMSize sets \p nvvmSizeRet with the size of the NVVM
301
+ * generated by the previous compilation of \p prog. The value of
302
+ * nvvmSizeRet is set to 0 if the program was not compiled with
303
+ * \c -dlto.
304
+ *
305
+ * \param [in] prog CUDA Runtime Compilation program.
306
+ * \param [out] nvvmSizeRet Size of the generated NVVM.
307
+ * \return
308
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
309
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
310
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
311
+ *
312
+ * \see ::nvrtcGetNVVM
313
+ */
314
+ nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet);
315
+
316
+
317
+ /**
318
+ * \ingroup compilation
319
+ * \brief nvrtcGetNVVM stores the NVVM generated by the previous compilation
320
+ * of \p prog in the memory pointed by \p nvvm.
321
+ * The program must have been compiled with -dlto,
322
+ * otherwise will return an error.
323
+ *
324
+ * \param [in] prog CUDA Runtime Compilation program.
325
+ * \param [out] nvvm Compiled result.
326
+ * \return
327
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
328
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
329
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
330
+ *
331
+ * \see ::nvrtcGetNVVMSize
332
+ */
333
+ nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm);
334
+
335
+ /**
336
+ * \ingroup compilation
337
+ * \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
338
+ * log generated by the previous compilation of \p prog (including the
339
+ * trailing \c NULL).
340
+ *
341
+ * Note that compilation log may be generated with warnings and informative
342
+ * messages, even when the compilation of \p prog succeeds.
343
+ *
344
+ * \param [in] prog CUDA Runtime Compilation program.
345
+ * \param [out] logSizeRet Size of the compilation log
346
+ * (including the trailing \c NULL).
347
+ * \return
348
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
349
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
350
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
351
+ *
352
+ * \see ::nvrtcGetProgramLog
353
+ */
354
+ nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
355
+
356
+
357
+ /**
358
+ * \ingroup compilation
359
+ * \brief nvrtcGetProgramLog stores the log generated by the previous
360
+ * compilation of \p prog in the memory pointed by \p log.
361
+ *
362
+ * \param [in] prog CUDA Runtime Compilation program.
363
+ * \param [out] log Compilation log.
364
+ * \return
365
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
366
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
367
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
368
+ *
369
+ * \see ::nvrtcGetProgramLogSize
370
+ */
371
+ nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
372
+
373
+
374
+ /**
375
+ * \ingroup compilation
376
+ * \brief nvrtcAddNameExpression notes the given name expression
377
+ * denoting the address of a __global__ function
378
+ * or __device__/__constant__ variable.
379
+ *
380
+ * The identical name expression string must be provided on a subsequent
381
+ * call to nvrtcGetLoweredName to extract the lowered name.
382
+ * \param [in] prog CUDA Runtime Compilation program.
383
+ * \param [in] name_expression constant expression denoting the address of
384
+ * a __global__ function or __device__/__constant__ variable.
385
+ * \return
386
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
387
+ * - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
388
+ *
389
+ * \see ::nvrtcGetLoweredName
390
+ */
391
+ nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
392
+ const char * const name_expression);
393
+
394
+ /**
395
+ * \ingroup compilation
396
+ * \brief nvrtcGetLoweredName extracts the lowered (mangled) name
397
+ * for a __global__ function or __device__/__constant__ variable,
398
+ * and updates *lowered_name to point to it. The memory containing
399
+ * the name is released when the NVRTC program is destroyed by
400
+ * nvrtcDestroyProgram.
401
+ * The identical name expression must have been previously
402
+ * provided to nvrtcAddNameExpression.
403
+ *
404
+ * \param [in] prog CUDA Runtime Compilation program.
405
+ * \param [in] name_expression constant expression denoting the address of
406
+ * a __global__ function or __device__/__constant__ variable.
407
+ * \param [out] lowered_name initialized by the function to point to a
408
+ * C string containing the lowered (mangled)
409
+ * name corresponding to the provided name expression.
410
+ * \return
411
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
412
+ * - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
413
+ * - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
414
+ *
415
+ * \see ::nvrtcAddNameExpression
416
+ */
417
+ nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
418
+ const char *const name_expression,
419
+ const char** lowered_name);
420
+
421
+
422
+ /**
423
+ * \defgroup options Supported Compile Options
424
+ *
425
+ * NVRTC supports the compile options below.
426
+ * Option names with two preceding dashs (\c --) are long option names and
427
+ * option names with one preceding dash (\c -) are short option names.
428
+ * Short option names can be used instead of long option names.
429
+ * When a compile option takes an argument, an assignment operator (\c =)
430
+ * is used to separate the compile option argument from the compile option
431
+ * name, e.g., \c "--gpu-architecture=compute_60".
432
+ * Alternatively, the compile option name and the argument can be specified in
433
+ * separate strings without an assignment operator, .e.g,
434
+ * \c "--gpu-architecture" \c "compute_60".
435
+ * Single-character short option names, such as \c -D, \c -U, and \c -I, do
436
+ * not require an assignment operator, and the compile option name and the
437
+ * argument can be present in the same string with or without spaces between
438
+ * them.
439
+ * For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
440
+ * supported.
441
+ *
442
+ * The valid compiler options are:
443
+ *
444
+ * - Compilation targets
445
+ * - \c --gpu-architecture=\<arch\> (\c -arch)\n
446
+ * Specify the name of the class of GPU architectures for which the
447
+ * input must be compiled.\n
448
+ * - Valid <c>\<arch\></c>s:
449
+ * - \c compute_35
450
+ * - \c compute_37
451
+ * - \c compute_50
452
+ * - \c compute_52
453
+ * - \c compute_53
454
+ * - \c compute_60
455
+ * - \c compute_61
456
+ * - \c compute_62
457
+ * - \c compute_70
458
+ * - \c compute_72
459
+ * - \c compute_75
460
+ * - \c compute_80
461
+ * - \c compute_87
462
+ * - \c compute_89
463
+ * - \c compute_90
464
+ * - \c sm_35
465
+ * - \c sm_37
466
+ * - \c sm_50
467
+ * - \c sm_52
468
+ * - \c sm_53
469
+ * - \c sm_60
470
+ * - \c sm_61
471
+ * - \c sm_62
472
+ * - \c sm_70
473
+ * - \c sm_72
474
+ * - \c sm_75
475
+ * - \c sm_80
476
+ * - \c sm_87
477
+ * - \c sm_89
478
+ * - \c sm_90
479
+ * - Default: \c compute_52
480
+ * - Separate compilation / whole-program compilation
481
+ * - \c --device-c (\c -dc)\n
482
+ * Generate relocatable code that can be linked with other relocatable
483
+ * device code. It is equivalent to --relocatable-device-code=true.
484
+ * - \c --device-w (\c -dw)\n
485
+ * Generate non-relocatable code. It is equivalent to
486
+ * \c --relocatable-device-code=false.
487
+ * - \c --relocatable-device-code={true|false} (\c -rdc)\n
488
+ * Enable (disable) the generation of relocatable device code.
489
+ * - Default: \c false
490
+ * - \c --extensible-whole-program (\c -ewp)\n
491
+ * Do extensible whole program compilation of device code.
492
+ * - Default: \c false
493
+ * - Debugging support
494
+ * - \c --device-debug (\c -G)\n
495
+ * Generate debug information. If --dopt is not specified,
496
+ * then turns off all optimizations.
497
+ * - \c --generate-line-info (\c -lineinfo)\n
498
+ * Generate line-number information.
499
+ * - Code generation
500
+ * - \c --dopt on (\c -dopt)\n
501
+ * - \c --dopt=on \n
502
+ * Enable device code optimization. When specified along with '-G', enables
503
+ * limited debug information generation for optimized device code (currently,
504
+ * only line number information).
505
+ * When '-G' is not specified, '-dopt=on' is implicit.
506
+ * - \c --ptxas-options \<options\> (\c -Xptxas)\n
507
+ * - \c --ptxas-options=\<options\> \n
508
+ * Specify options directly to ptxas, the PTX optimizing assembler.
509
+ * - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
510
+ * Specify the maximum amount of registers that GPU functions can use.
511
+ * Until a function-specific limit, a higher value will generally
512
+ * increase the performance of individual GPU threads that execute this
513
+ * function. However, because thread registers are allocated from a
514
+ * global register pool on each GPU, a higher value of this option will
515
+ * also reduce the maximum thread block size, thereby reducing the amount
516
+ * of thread parallelism. Hence, a good maxrregcount value is the result
517
+ * of a trade-off. If this option is not specified, then no maximum is
518
+ * assumed. Value less than the minimum registers required by ABI will
519
+ * be bumped up by the compiler to ABI minimum limit.
520
+ * - \c --ftz={true|false} (\c -ftz)\n
521
+ * When performing single-precision floating-point operations, flush
522
+ * denormal values to zero or preserve denormal values.
523
+ * \c --use_fast_math implies \c --ftz=true.
524
+ * - Default: \c false
525
+ * - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
526
+ * For single-precision floating-point square root, use IEEE
527
+ * round-to-nearest mode or use a faster approximation.
528
+ * \c --use_fast_math implies \c --prec-sqrt=false.
529
+ * - Default: \c true
530
+ * - \c --prec-div={true|false} (\c -prec-div)\n
531
+ * For single-precision floating-point division and reciprocals, use IEEE
532
+ * round-to-nearest mode or use a faster approximation.
533
+ * \c --use_fast_math implies \c --prec-div=false.
534
+ * - Default: \c true
535
+ * - \c --fmad={true|false} (\c -fmad)\n
536
+ * Enables (disables) the contraction of floating-point multiplies and
537
+ * adds/subtracts into floating-point multiply-add operations (FMAD,
538
+ * FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
539
+ * - Default: \c true
540
+ * - \c --use_fast_math (\c -use_fast_math)\n
541
+ * Make use of fast math operations.
542
+ * \c --use_fast_math implies \c --ftz=true \c --prec-div=false
543
+ * \c --prec-sqrt=false \c --fmad=true.
544
+ * - \c --extra-device-vectorization (\c -extra-device-vectorization)\n
545
+ * Enables more aggressive device code vectorization in the NVVM optimizer.
546
+ * - \c --modify-stack-limit={true|false} (\c -modify-stack-limit)\n
547
+ * On Linux, during compilation, use \c setrlimit() to increase stack size
548
+ * to maximum allowed. The limit is reset to the previous value at the
549
+ * end of compilation.
550
+ * Note: \c setrlimit() changes the value for the entire process.
551
+ * - Default: \c true
552
+ * - \c --dlink-time-opt (\c -dlto)\n
553
+ * Generate intermediate code for later link-time optimization.
554
+ * It implies \c -rdc=true.
555
+ * Note: when this is used the nvrtcGetNVVM API should be used,
556
+ * as PTX or Cubin will not be generated.
557
+ * - Preprocessing
558
+ * - \c --define-macro=\<def\> (\c -D)\n
559
+ * \c \<def\> can be either \c \<name\> or \c \<name=definitions\>.
560
+ * - \c \<name\> \n
561
+ * Predefine \c \<name\> as a macro with definition \c 1.
562
+ * - \c \<name\>=\<definition\> \n
563
+ * The contents of \c \<definition\> are tokenized and preprocessed
564
+ * as if they appeared during translation phase three in a \c \#define
565
+ * directive. In particular, the definition will be truncated by
566
+ * embedded new line characters.
567
+ * - \c --undefine-macro=\<def\> (\c -U)\n
568
+ * Cancel any previous definition of \c \<def\>.
569
+ * - \c --include-path=\<dir\> (\c -I)\n
570
+ * Add the directory \c \<dir\> to the list of directories to be
571
+ * searched for headers. These paths are searched after the list of
572
+ * headers given to ::nvrtcCreateProgram.
573
+ * - \c --pre-include=\<header\> (\c -include)\n
574
+ * Preinclude \c \<header\> during preprocessing.
575
+ * - \c --no-source-include (\c -no-source-include)
576
+ * The preprocessor by default adds the directory of each input sources
577
+ * to the include path. This option disables this feature and only
578
+ * considers the path specified explicitly.
579
+ * - Language Dialect
580
+ * - \c --std={c++03|c++11|c++14|c++17}
581
+ * (\c -std={c++11|c++14|c++17})\n
582
+ * Set language dialect to C++03, C++11, C++14 or C++17
583
+ * - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
584
+ * Provide builtin definitions of \c std::move and \c std::forward,
585
+ * when C++11 language dialect is selected.
586
+ * - Default: \c true
587
+ * - \c --builtin-initializer-list={true|false}
588
+ * (\c -builtin-initializer-list)\n
589
+ * Provide builtin definitions of \c std::initializer_list class and
590
+ * member functions when C++11 language dialect is selected.
591
+ * - Default: \c true
592
+ * - Misc.
593
+ * - \c --disable-warnings (\c -w)\n
594
+ * Inhibit all warning messages.
595
+ * - \c --restrict (\c -restrict)\n
596
+ * Programmer assertion that all kernel pointer parameters are restrict
597
+ * pointers.
598
+ * - \c --device-as-default-execution-space
599
+ * (\c -default-device)\n
600
+ * Treat entities with no execution space annotation as \c __device__
601
+ * entities.
602
+ * - \c --device-int128 (\c -device-int128)\n
603
+ * Allow the \c __int128 type in device code. Also causes the macro \c __CUDACC_RTC_INT128__
604
+ * to be defined.
605
+ * - \c --optimization-info=\<kind\> (\c -opt-info)\n
606
+ * Provide optimization reports for the specified kind of optimization.
607
+ * The following kind tags are supported:
608
+ * - \c inline : emit a remark when a function is inlined.
609
+ * - \c --version-ident={true|false} (\c -dQ)\n
610
+ * Embed used compiler's version info into generated PTX/CUBIN
611
+ * - Default: \c false
612
+ * - \c --display-error-number (\c -err-no)\n
613
+ * Display diagnostic number for warning messages. (Default)
614
+ * - \c --no-display-error-number (\c -no-err-no)\n
615
+ * Disables the display of a diagnostic number for warning messages.
616
+ * - \c --diag-error=<error-number>,... (\c -diag-error)\n
617
+ * Emit error for specified diagnostic message number(s). Message numbers can be separated by comma.
618
+ * - \c --diag-suppress=<error-number>,... (\c -diag-suppress)\n
619
+ * Suppress specified diagnostic message number(s). Message numbers can be separated by comma.
620
+ * - \c --diag-warn=<error-number>,... (\c -diag-warn)\n
621
+ * Emit warning for specified diagnostic message number(s). Message numbers can be separated by comma.
622
+ *
623
+ */
624
+
625
+
626
+ #ifdef __cplusplus
627
+ }
628
+ #endif /* __cplusplus */
629
+
630
+
631
+ /* The utility function 'nvrtcGetTypeName' is not available by default. Define
632
+ the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
633
+ */
634
+
635
+ #if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
636
+
637
+ #if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
638
+ #include <cxxabi.h>
639
+ #include <cstdlib>
640
+
641
+ #elif defined(_WIN32)
642
+ #include <Windows.h>
643
+ #include <DbgHelp.h>
644
+ #endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
645
+
646
+
647
+ #include <string>
648
+ #include <typeinfo>
649
+
650
+ template <typename T> struct __nvrtcGetTypeName_helper_t { };
651
+
652
+ /*************************************************************************//**
653
+ *
654
+ * \defgroup hosthelper Host Helper
655
+ *
656
+ * NVRTC defines the following functions for easier interaction with host code.
657
+ *
658
+ ****************************************************************************/
659
+
660
+ /**
661
+ * \ingroup hosthelper
662
+ * \brief nvrtcGetTypeName stores the source level name of a type in the given
663
+ * std::string location.
664
+ *
665
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
666
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
667
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
668
+ * respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
669
+ * otherwise *result is initialized with the extracted name.
670
+ *
671
+ * Windows-specific notes:
672
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
673
+ * which is not multi-thread safe.
674
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
675
+ *
676
+ * \param [in] tinfo: reference to object of type std::type_info for a given type.
677
+ * \param [in] result: pointer to std::string in which to store the type name.
678
+ * \return
679
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
680
+ * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
681
+ *
682
+ */
683
+ inline nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result)
684
+ {
685
+ #if USE_CXXABI || __clang__ || __GNUC__
686
+ const char *name = tinfo.name();
687
+ int status;
688
+ char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
689
+ if (status == 0) {
690
+ *result = undecorated_name;
691
+ free(undecorated_name);
692
+ return NVRTC_SUCCESS;
693
+ }
694
+ #elif defined(_WIN32)
695
+ const char *name = tinfo.raw_name();
696
+ if (!name || *name != '.') {
697
+ return NVRTC_ERROR_INTERNAL_ERROR;
698
+ }
699
+ char undecorated_name[4096];
700
+ //name+1 skips over the '.' prefix
701
+ if(UnDecorateSymbolName(name+1, undecorated_name,
702
+ sizeof(undecorated_name) / sizeof(*undecorated_name),
703
+ //note: doesn't seem to work correctly without UNDNAME_NO_ARGUMENTS.
704
+ UNDNAME_NO_ARGUMENTS | UNDNAME_NAME_ONLY ) ) {
705
+ *result = undecorated_name;
706
+ return NVRTC_SUCCESS;
707
+ }
708
+ #endif /* USE_CXXABI || __clang__ || __GNUC__ */
709
+
710
+ return NVRTC_ERROR_INTERNAL_ERROR;
711
+ }
712
+
713
+ /**
714
+ * \ingroup hosthelper
715
+ * \brief nvrtcGetTypeName stores the source level name of the template type argument
716
+ * T in the given std::string location.
717
+ *
718
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
719
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
720
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
721
+ * respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
722
+ * otherwise *result is initialized with the extracted name.
723
+ *
724
+ * Windows-specific notes:
725
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
726
+ * which is not multi-thread safe.
727
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
728
+ *
729
+ * \param [in] result: pointer to std::string in which to store the type name.
730
+ * \return
731
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
732
+ * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
733
+ *
734
+ */
735
+
736
+ template <typename T>
737
+ nvrtcResult nvrtcGetTypeName(std::string *result)
738
+ {
739
+ nvrtcResult res = nvrtcGetTypeName(typeid(__nvrtcGetTypeName_helper_t<T>),
740
+ result);
741
+ if (res != NVRTC_SUCCESS)
742
+ return res;
743
+
744
+ std::string repr = *result;
745
+ std::size_t idx = repr.find("__nvrtcGetTypeName_helper_t");
746
+ idx = (idx != std::string::npos) ? repr.find("<", idx) : idx;
747
+ std::size_t last_idx = repr.find_last_of('>');
748
+ if (idx == std::string::npos || last_idx == std::string::npos) {
749
+ return NVRTC_ERROR_INTERNAL_ERROR;
750
+ }
751
+ ++idx;
752
+ *result = repr.substr(idx, last_idx - idx);
753
+ return NVRTC_SUCCESS;
754
+ }
755
+
756
+ #endif /* NVRTC_GET_TYPE_NAME */
757
+
758
+ #endif /* __NVRTC_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (222 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(CU_COMPLEX_H_)
51
+ #define CU_COMPLEX_H_
52
+
53
+ #if !defined(__CUDACC_RTC__)
54
+ #if defined(__GNUC__)
55
+ #if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
56
+ #pragma GCC diagnostic ignored "-Wunused-function"
57
+ #endif
58
+ #endif
59
+ #endif
60
+
61
+ /* When trying to include C header file in C++ Code extern "C" is required
62
+ * But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
63
+ * extern "C" cannot be nested
64
+ * Hence keep the header out of extern "C" block
65
+ */
66
+
67
+ #if !defined(__CUDACC__)
68
+ #include <math.h> /* import fabsf, sqrt */
69
+ #endif /* !defined(__CUDACC__) */
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif /* __cplusplus */
74
+
75
+ #include "vector_types.h"
76
+
77
+ typedef float2 cuFloatComplex;
78
+
79
+ __host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
80
+ {
81
+ return x.x;
82
+ }
83
+
84
+ __host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
85
+ {
86
+ return x.y;
87
+ }
88
+
89
+ __host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
90
+ (float r, float i)
91
+ {
92
+ cuFloatComplex res;
93
+ res.x = r;
94
+ res.y = i;
95
+ return res;
96
+ }
97
+
98
+ __host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
99
+ {
100
+ return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
101
+ }
102
+ __host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
103
+ cuFloatComplex y)
104
+ {
105
+ return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
106
+ cuCimagf(x) + cuCimagf(y));
107
+ }
108
+
109
+ __host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
110
+ cuFloatComplex y)
111
+ {
112
+ return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
113
+ cuCimagf(x) - cuCimagf(y));
114
+ }
115
+
116
+ /* This implementation could suffer from intermediate overflow even though
117
+ * the final result would be in range. However, various implementations do
118
+ * not guard against this (presumably to avoid losing performance), so we
119
+ * don't do it either to stay competitive.
120
+ */
121
+ __host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
122
+ cuFloatComplex y)
123
+ {
124
+ cuFloatComplex prod;
125
+ prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
126
+ (cuCimagf(x) * cuCimagf(y)),
127
+ (cuCrealf(x) * cuCimagf(y)) +
128
+ (cuCimagf(x) * cuCrealf(y)));
129
+ return prod;
130
+ }
131
+
132
+ /* This implementation guards against intermediate underflow and overflow
133
+ * by scaling. Such guarded implementations are usually the default for
134
+ * complex library implementations, with some also offering an unguarded,
135
+ * faster version.
136
+ */
137
+ __host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
138
+ cuFloatComplex y)
139
+ {
140
+ cuFloatComplex quot;
141
+ float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
142
+ float oos = 1.0f / s;
143
+ float ars = cuCrealf(x) * oos;
144
+ float ais = cuCimagf(x) * oos;
145
+ float brs = cuCrealf(y) * oos;
146
+ float bis = cuCimagf(y) * oos;
147
+ s = (brs * brs) + (bis * bis);
148
+ oos = 1.0f / s;
149
+ quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
150
+ ((ais * brs) - (ars * bis)) * oos);
151
+ return quot;
152
+ }
153
+
154
+ /*
155
+ * We would like to call hypotf(), but it's not available on all platforms.
156
+ * This discrete implementation guards against intermediate underflow and
157
+ * overflow by scaling. Otherwise we would lose half the exponent range.
158
+ * There are various ways of doing guarded computation. For now chose the
159
+ * simplest and fastest solution, however this may suffer from inaccuracies
160
+ * if sqrt and division are not IEEE compliant.
161
+ */
162
+ __host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
163
+ {
164
+ float a = cuCrealf(x);
165
+ float b = cuCimagf(x);
166
+ float v, w, t;
167
+ a = fabsf(a);
168
+ b = fabsf(b);
169
+ if (a > b) {
170
+ v = a;
171
+ w = b;
172
+ } else {
173
+ v = b;
174
+ w = a;
175
+ }
176
+ t = w / v;
177
+ t = 1.0f + t * t;
178
+ t = v * sqrtf(t);
179
+ if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
180
+ t = v + w;
181
+ }
182
+ return t;
183
+ }
184
+
185
+ /* Double precision */
186
+ typedef double2 cuDoubleComplex;
187
+
188
+ __host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
189
+ {
190
+ return x.x;
191
+ }
192
+
193
+ __host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
194
+ {
195
+ return x.y;
196
+ }
197
+
198
+ __host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
199
+ (double r, double i)
200
+ {
201
+ cuDoubleComplex res;
202
+ res.x = r;
203
+ res.y = i;
204
+ return res;
205
+ }
206
+
207
+ __host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
208
+ {
209
+ return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
210
+ }
211
+
212
+ __host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
213
+ cuDoubleComplex y)
214
+ {
215
+ return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
216
+ cuCimag(x) + cuCimag(y));
217
+ }
218
+
219
+ __host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
220
+ cuDoubleComplex y)
221
+ {
222
+ return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
223
+ cuCimag(x) - cuCimag(y));
224
+ }
225
+
226
+ /* This implementation could suffer from intermediate overflow even though
227
+ * the final result would be in range. However, various implementations do
228
+ * not guard against this (presumably to avoid losing performance), so we
229
+ * don't do it either to stay competitive.
230
+ */
231
+ __host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
232
+ cuDoubleComplex y)
233
+ {
234
+ cuDoubleComplex prod;
235
+ prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
236
+ (cuCimag(x) * cuCimag(y)),
237
+ (cuCreal(x) * cuCimag(y)) +
238
+ (cuCimag(x) * cuCreal(y)));
239
+ return prod;
240
+ }
241
+
242
+ /* This implementation guards against intermediate underflow and overflow
243
+ * by scaling. Such guarded implementations are usually the default for
244
+ * complex library implementations, with some also offering an unguarded,
245
+ * faster version.
246
+ */
247
+ __host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
248
+ cuDoubleComplex y)
249
+ {
250
+ cuDoubleComplex quot;
251
+ double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
252
+ double oos = 1.0 / s;
253
+ double ars = cuCreal(x) * oos;
254
+ double ais = cuCimag(x) * oos;
255
+ double brs = cuCreal(y) * oos;
256
+ double bis = cuCimag(y) * oos;
257
+ s = (brs * brs) + (bis * bis);
258
+ oos = 1.0 / s;
259
+ quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
260
+ ((ais * brs) - (ars * bis)) * oos);
261
+ return quot;
262
+ }
263
+
264
+ /* This implementation guards against intermediate underflow and overflow
265
+ * by scaling. Otherwise we would lose half the exponent range. There are
266
+ * various ways of doing guarded computation. For now chose the simplest
267
+ * and fastest solution, however this may suffer from inaccuracies if sqrt
268
+ * and division are not IEEE compliant.
269
+ */
270
+ __host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
271
+ {
272
+ double a = cuCreal(x);
273
+ double b = cuCimag(x);
274
+ double v, w, t;
275
+ a = fabs(a);
276
+ b = fabs(b);
277
+ if (a > b) {
278
+ v = a;
279
+ w = b;
280
+ } else {
281
+ v = b;
282
+ w = a;
283
+ }
284
+ t = w / v;
285
+ t = 1.0 + t * t;
286
+ t = v * sqrt(t);
287
+ if ((v == 0.0) ||
288
+ (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
289
+ t = v + w;
290
+ }
291
+ return t;
292
+ }
293
+
294
+ #if defined(__cplusplus)
295
+ }
296
+ #endif /* __cplusplus */
297
+
298
+ /* aliases */
299
+ typedef cuFloatComplex cuComplex;
300
+ __host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
301
+ float y)
302
+ {
303
+ return make_cuFloatComplex (x, y);
304
+ }
305
+
306
+ /* float-to-double promotion */
307
+ __host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
308
+ (cuFloatComplex c)
309
+ {
310
+ return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
311
+ }
312
+
313
+ __host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
314
+ (cuDoubleComplex c)
315
+ {
316
+ return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
317
+ }
318
+
319
+
320
+ __host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
321
+ {
322
+ float real_res;
323
+ float imag_res;
324
+
325
+ real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
326
+ imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
327
+
328
+ real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
329
+ imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
330
+
331
+ return make_cuComplex(real_res, imag_res);
332
+ }
333
+
334
+ __host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
335
+ {
336
+ double real_res;
337
+ double imag_res;
338
+
339
+ real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
340
+ imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
341
+
342
+ real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
343
+ imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
344
+
345
+ return make_cuDoubleComplex(real_res, imag_res);
346
+ }
347
+
348
+ #endif /* !defined(CU_COMPLEX_H_) */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h ADDED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_AWBARRIER_H_
51
+ # define _CUDA_AWBARRIER_H_
52
+
53
+ # include "cuda_awbarrier_primitives.h"
54
+
55
+ # if !defined(_CUDA_AWBARRIER_SM_TARGET)
56
+ # error This file requires compute capability 7.0 or greater.
57
+ # endif
58
+
59
+ # if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER)
60
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
61
+ -std=c++11 compiler option.
62
+ # endif
63
+
64
+ _CUDA_AWBARRIER_BEGIN_NAMESPACE
65
+
66
+ class awbarrier {
67
+ public:
68
+ class arrival_token {
69
+ public:
70
+ arrival_token() = default;
71
+ ~arrival_token() = default;
72
+ _CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const;
73
+ private:
74
+ _CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token);
75
+ uint64_t token;
76
+ friend awbarrier;
77
+ };
78
+ awbarrier() = default;
79
+ awbarrier(const awbarrier&) = delete;
80
+ awbarrier& operator=(const awbarrier&) = delete;
81
+ ~awbarrier() = default;
82
+
83
+ _CUDA_AWBARRIER_QUALIFIER arrival_token arrive();
84
+ _CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop();
85
+ _CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles);
86
+ _CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token);
87
+ _CUDA_AWBARRIER_QUALIFIER void arrive_and_wait();
88
+ _CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max();
89
+ private:
90
+ uint64_t barrier;
91
+ friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count);
92
+ friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier);
93
+ friend class pipeline;
94
+ };
95
+
96
+ _CUDA_AWBARRIER_QUALIFIER
97
+ uint32_t awbarrier::arrival_token::pending_count() const
98
+ {
99
+ const uint32_t pending_count = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token);
100
+ #if (__CUDA_ARCH__ >= 900)
101
+ return pending_count;
102
+ #else
103
+ return (pending_count >> 15);
104
+ #endif
105
+ }
106
+
107
+ _CUDA_AWBARRIER_QUALIFIER
108
+ awbarrier::arrival_token::arrival_token(uint64_t token)
109
+ : token(token)
110
+ {
111
+ }
112
+
113
+ _CUDA_AWBARRIER_QUALIFIER
114
+ void init(awbarrier* barrier, uint32_t expected_count)
115
+ {
116
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
117
+ _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);
118
+
119
+ #if (__CUDA_ARCH__ >= 900)
120
+ const uint32_t init_count = expected_count;
121
+ #else
122
+ const uint32_t init_count = (expected_count << 15) + expected_count;
123
+ #endif
124
+
125
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, init_count);
126
+ }
127
+
128
+ _CUDA_AWBARRIER_QUALIFIER
129
+ void inval(awbarrier* barrier)
130
+ {
131
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
132
+
133
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(&barrier->barrier);
134
+ }
135
+
136
+ _CUDA_AWBARRIER_QUALIFIER
137
+ awbarrier::arrival_token awbarrier::arrive()
138
+ {
139
+ _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
140
+
141
+ #if (__CUDA_ARCH__ < 900)
142
+ const uint32_t arrive_count = 1 << 15;
143
+ const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, arrive_count);
144
+ (void)
145
+ #else
146
+ const uint64_t token =
147
+ #endif
148
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
149
+
150
+ return arrival_token(token);
151
+ }
152
+
153
+ _CUDA_AWBARRIER_QUALIFIER
154
+ awbarrier::arrival_token awbarrier::arrive_and_drop()
155
+ {
156
+ _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
157
+
158
+ #if (__CUDA_ARCH__ < 900)
159
+ const uint32_t arrive_count = 1 << 15;
160
+ const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, arrive_count);
161
+ (void)
162
+ #else
163
+ const uint64_t token =
164
+ #endif
165
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
166
+
167
+ return arrival_token(token);
168
+ }
169
+
170
+ _CUDA_AWBARRIER_QUALIFIER
171
+ bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles)
172
+ {
173
+ constexpr uint64_t max_busy_wait_cycles = 1024;
174
+ constexpr uint32_t max_sleep_ns = 1 << 20;
175
+
176
+ _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
177
+
178
+ if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
179
+ return true;
180
+ }
181
+
182
+ uint64_t start_cycles = clock64();
183
+ uint64_t elapsed_cycles = 0;
184
+ uint32_t sleep_ns = 32;
185
+ while (elapsed_cycles < hint_cycles) {
186
+ if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
187
+ return true;
188
+ }
189
+
190
+ if (elapsed_cycles > max_busy_wait_cycles) {
191
+ __nanosleep(sleep_ns);
192
+ if (sleep_ns < max_sleep_ns) {
193
+ sleep_ns *= 2;
194
+ }
195
+ }
196
+
197
+ elapsed_cycles = clock64() - start_cycles;
198
+ }
199
+
200
+ return false;
201
+ }
202
+
203
+ _CUDA_AWBARRIER_QUALIFIER
204
+ void awbarrier::wait(arrival_token token)
205
+ {
206
+ _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
207
+
208
+ while (!timed_wait(token, ~0u));
209
+ }
210
+
211
+ _CUDA_AWBARRIER_QUALIFIER
212
+ void awbarrier::arrive_and_wait()
213
+ {
214
+ _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
215
+
216
+ this->wait(this->arrive());
217
+ }
218
+
219
+ _CUDA_AWBARRIER_QUALIFIER __host__
220
+ constexpr uint32_t awbarrier::max()
221
+ {
222
+ return _CUDA_AWBARRIER_MAX_COUNT;
223
+ }
224
+
225
+ _CUDA_AWBARRIER_END_NAMESPACE
226
+
227
+ #endif /* !_CUDA_AWBARRIER_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_AWBARRIER_HELPERS_H_
51
+ #define _CUDA_AWBARRIER_HELPERS_H_
52
+
53
+ #define _CUDA_AWBARRIER_NAMESPACE nvcuda::experimental
54
+ #define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
55
+ #define _CUDA_AWBARRIER_END_NAMESPACE } }
56
+
57
+ #define _CUDA_AWBARRIER_INTERNAL_NAMESPACE _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
58
+ #define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
59
+ #define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE } _CUDA_AWBARRIER_END_NAMESPACE
60
+
61
+ # if !defined(_CUDA_AWBARRIER_QUALIFIER)
62
+ # define _CUDA_AWBARRIER_QUALIFIER inline __device__
63
+ # endif
64
+ # if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)
65
+ # define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
66
+ #endif
67
+
68
+ #if defined(__CUDA_ARCH__)
69
+ #if (__CUDA_ARCH__ >= 800)
70
+ # define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_80
71
+ #elif (__CUDA_ARCH__ >= 700)
72
+ # define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
73
+ #endif // No support < 700
74
+ #else // !defined(__CUDA_ARCH__)
75
+ # define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
76
+ #endif // defined(__CUDA_ARCH__)
77
+
78
+ #define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)
79
+
80
+ #if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
81
+ # define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
82
+ #endif
83
+
84
+ #if !defined(_CUDA_AWBARRIER_DEBUG)
85
+ # if defined(__CUDACC_DEBUG__)
86
+ # define _CUDA_AWBARRIER_DEBUG 1
87
+ # else
88
+ # define _CUDA_AWBARRIER_DEBUG 0
89
+ # endif
90
+ #endif
91
+
92
+ #if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
93
+ # if !defined(__CUDACC_RTC__)
94
+ # include <cassert>
95
+ # endif
96
+ # define _CUDA_AWBARRIER_ASSERT(x) assert((x));
97
+ # define _CUDA_AWBARRIER_ABORT() assert(0);
98
+ #else
99
+ # define _CUDA_AWBARRIER_ASSERT(x)
100
+ # define _CUDA_AWBARRIER_ABORT() __trap();
101
+ #endif
102
+
103
+ #if defined(__CUDACC_RTC__)
104
+ typedef unsigned short uint16_t;
105
+ typedef unsigned int uint32_t;
106
+ typedef unsigned long long uint64_t;
107
+ typedef uint64_t uintptr_t;
108
+ #else
109
+ # include <stdint.h>
110
+ #endif
111
+
112
+ #if defined(_CUDA_AWBARRIER_SM_TARGET)
113
+
114
+ typedef uint64_t __mbarrier_t;
115
+ typedef uint64_t __mbarrier_token_t;
116
+
117
+ _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE
118
+
119
+ extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
120
+
121
+ namespace _CUDA_AWBARRIER_SM_70 {
122
+ union AWBarrier {
123
+ struct {
124
+ uint32_t expected;
125
+ uint32_t pending;
126
+ } split;
127
+ uint64_t raw;
128
+ };
129
+
130
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
131
+ void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
132
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
133
+ _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
134
+
135
+ AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
136
+
137
+ awbarrier->split.expected = 0x40000000 - expected_count;
138
+ awbarrier->split.pending = 0x80000000 - expected_count;
139
+ }
140
+
141
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
142
+ void __awbarrier_inval(uint64_t* barrier) {
143
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
144
+ }
145
+
146
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
147
+ uint32_t __awbarrier_token_pending_count(uint64_t token) {
148
+ const uint32_t pending = token >> 32;
149
+ return 0x80000000 - (pending & 0x7fffffff);
150
+ }
151
+
152
+ template<bool _Drop>
153
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
154
+ uint64_t __awbarrier_arrive_drop(uint64_t* barrier) {
155
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
156
+
157
+ AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
158
+
159
+ while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
160
+
161
+ if (_Drop) {
162
+ (void)atomicAdd_block(&awbarrier->split.expected, 1);
163
+ }
164
+
165
+ __threadfence_block();
166
+
167
+ const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1);
168
+ const uint32_t new_pending = old_pending + 1;
169
+ const bool reset = (old_pending ^ new_pending) & 0x80000000;
170
+
171
+ if (reset) {
172
+ __threadfence_block();
173
+
174
+ uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected);
175
+ new_expected &= ~0x40000000;
176
+ if (new_expected & 0x20000000) {
177
+ new_expected |= 0x40000000;
178
+ }
179
+ atomicAdd_block(&awbarrier->split.pending, new_expected);
180
+ }
181
+
182
+ return static_cast<uint64_t>(old_pending) << 32;
183
+ }
184
+
185
+ template<bool _Drop>
186
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
187
+ uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
188
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
189
+ _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
190
+
191
+ AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
192
+
193
+ while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
194
+
195
+ if (_Drop) {
196
+ (void)atomicAdd_block(&awbarrier->split.expected, count);
197
+ }
198
+
199
+ return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32;
200
+ }
201
+
202
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
203
+ bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
204
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
205
+
206
+ volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier);
207
+
208
+ return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000;
209
+ }
210
+ }; // namespace _CUDA_AWBARRIER_SM_70
211
+
212
+ namespace _CUDA_AWBARRIER_SM_80 {
213
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
214
+ void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
215
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
216
+ _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
217
+
218
+ asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
219
+ :
220
+ : "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
221
+ : "memory");
222
+ }
223
+
224
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
225
+ void __awbarrier_inval(uint64_t* barrier) {
226
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
227
+
228
+ asm volatile ("mbarrier.inval.shared.b64 [%0];"
229
+ :
230
+ : "r"(__nvvm_get_smem_pointer(barrier))
231
+ : "memory");
232
+ }
233
+
234
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
235
+ uint32_t __awbarrier_token_pending_count(uint64_t token) {
236
+ uint32_t __pending_count;
237
+
238
+ asm ("mbarrier.pending_count.b64 %0, %1;"
239
+ : "=r"(__pending_count)
240
+ : "l"(token));
241
+ return __pending_count;
242
+ }
243
+
244
+ template<bool _Drop>
245
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
246
+ uint64_t __awbarrier_arrive_drop(uint64_t* barrier) {
247
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
248
+
249
+ uint64_t token;
250
+
251
+ if (_Drop) {
252
+ asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
253
+ : "=l"(token)
254
+ : "r"(__nvvm_get_smem_pointer(barrier))
255
+ : "memory");
256
+ } else {
257
+ asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
258
+ : "=l"(token)
259
+ : "r"(__nvvm_get_smem_pointer(barrier))
260
+ : "memory");
261
+ }
262
+
263
+ return token;
264
+ }
265
+
266
+ template<bool _Drop>
267
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
268
+ uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
269
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
270
+ _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
271
+
272
+ uint64_t token;
273
+
274
+ if (_Drop) {
275
+ asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
276
+ : "=l"(token)
277
+ : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
278
+ : "memory");
279
+ } else {
280
+ asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
281
+ : "=l"(token)
282
+ : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
283
+ : "memory");
284
+ }
285
+
286
+ return token;
287
+ }
288
+
289
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
290
+ bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
291
+ _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
292
+
293
+ uint16_t __wait_complete;
294
+
295
+ asm volatile ("{"
296
+ " .reg .pred %%p;"
297
+ " mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
298
+ " selp.u16 %0, 1, 0, %%p;"
299
+ "}"
300
+ : "=h"(__wait_complete)
301
+ : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
302
+ : "memory");
303
+ return bool(__wait_complete);
304
+ }
305
+
306
+ }; // namespace _CUDA_AWBARRIER_SM_80
307
+
308
+ _CUDA_AWBARRIER_QUALIFIER
309
+ void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
310
+ {
311
+ _CUDA_AWBARRIER_SM_TARGET::__awbarrier_init(barrier, expected_count);
312
+ }
313
+
314
+ _CUDA_AWBARRIER_QUALIFIER
315
+ void awbarrier_inval(uint64_t* barrier)
316
+ {
317
+ _CUDA_AWBARRIER_SM_TARGET::__awbarrier_inval(barrier);
318
+ }
319
+
320
+ _CUDA_AWBARRIER_QUALIFIER
321
+ uint32_t awbarrier_token_pending_count(uint64_t token)
322
+ {
323
+ return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_token_pending_count(token);
324
+ }
325
+
326
+ template<bool _Drop>
327
+ _CUDA_AWBARRIER_QUALIFIER
328
+ uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t arrive_count)
329
+ {
330
+ return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop_no_complete<_Drop>(barrier, arrive_count);
331
+ }
332
+
333
+ template<bool _Drop>
334
+ _CUDA_AWBARRIER_QUALIFIER
335
+ uint64_t awbarrier_arrive_drop(uint64_t* barrier)
336
+ {
337
+ return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop<_Drop>(barrier);
338
+ }
339
+
340
+ _CUDA_AWBARRIER_QUALIFIER
341
+ bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
342
+ {
343
+ return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_test_wait(barrier, token);
344
+ }
345
+
346
+ _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE
347
+
348
+ #endif /* defined(_CUDA_AWBARRIER_SM_TARGET) */
349
+
350
+ #endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_AWBARRIER_PRIMITIVES_H_
51
+ #define _CUDA_AWBARRIER_PRIMITIVES_H_
52
+
53
+ #include "cuda_awbarrier_helpers.h"
54
+
55
+ #if !defined(_CUDA_AWBARRIER_SM_TARGET)
56
+ # error This file requires compute capability 7.0 or greater.
57
+ #endif
58
+
59
+ _CUDA_AWBARRIER_STATIC_QUALIFIER __host__
60
+ uint32_t __mbarrier_maximum_count() {
61
+ return _CUDA_AWBARRIER_MAX_COUNT;
62
+ }
63
+
64
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
65
+ void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count) {
66
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count);
67
+ }
68
+
69
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
70
+ void __mbarrier_inval(__mbarrier_t* barrier) {
71
+ _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier);
72
+ }
73
+
74
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
75
+ __mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier) {
76
+ return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(barrier);
77
+ }
78
+
79
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
80
+ __mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier) {
81
+ return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(barrier);
82
+ }
83
+
84
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
85
+ bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token) {
86
+ return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token);
87
+ }
88
+
89
+ _CUDA_AWBARRIER_STATIC_QUALIFIER
90
+ uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token) {
91
+ return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token);
92
+ }
93
+
94
+ #endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h ADDED
@@ -0,0 +1,1958 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * CUDA Occupancy Calculator
52
+ *
53
+ * NAME
54
+ *
55
+ * cudaOccMaxActiveBlocksPerMultiprocessor,
56
+ * cudaOccMaxPotentialOccupancyBlockSize,
57
+ * cudaOccMaxPotentialOccupancyBlockSizeVariableSMem
58
+ * cudaOccAvailableDynamicSMemPerBlock
59
+ *
60
+ * DESCRIPTION
61
+ *
62
+ * The CUDA occupancy calculator provides a standalone, programmatical
63
+ * interface to compute the occupancy of a function on a device. It can also
64
+ * provide occupancy-oriented launch configuration suggestions.
65
+ *
66
+ * The function and device are defined by the user through
67
+ * cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
68
+ * structures. All APIs require all 3 of them.
69
+ *
70
+ * See the structure definition for more details about the device / function
71
+ * descriptors.
72
+ *
73
+ * See each API's prototype for API usage.
74
+ *
75
+ * COMPATIBILITY
76
+ *
77
+ * The occupancy calculator will be updated on each major CUDA toolkit
78
+ * release. It does not provide forward compatibility, i.e. new hardwares
79
+ * released after this implementation's release will not be supported.
80
+ *
81
+ * NOTE
82
+ *
83
+ * If there is access to CUDA runtime, and the sole intent is to calculate
84
+ * occupancy related values on one of the accessible CUDA devices, using CUDA
85
+ * runtime's occupancy calculation APIs is recommended.
86
+ *
87
+ */
88
+
89
+ #ifndef __cuda_occupancy_h__
90
+ #define __cuda_occupancy_h__
91
+
92
+ #include <stddef.h>
93
+ #include <limits.h>
94
+ #include <string.h>
95
+
96
+
97
+ // __OCC_INLINE will be undefined at the end of this header
98
+ //
99
+ #ifdef __CUDACC__
100
+ #define __OCC_INLINE inline __host__ __device__
101
+ #elif defined _MSC_VER
102
+ #define __OCC_INLINE __inline
103
+ #else // GNUCC assumed
104
+ #define __OCC_INLINE inline
105
+ #endif
106
+
107
+ enum cudaOccError_enum {
108
+ CUDA_OCC_SUCCESS = 0, // no error encountered
109
+ CUDA_OCC_ERROR_INVALID_INPUT = 1, // input parameter is invalid
110
+ CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2, // requested device is not supported in
111
+ // current implementation or device is
112
+ // invalid
113
+ };
114
+ typedef enum cudaOccError_enum cudaOccError;
115
+
116
+ typedef struct cudaOccResult cudaOccResult;
117
+ typedef struct cudaOccDeviceProp cudaOccDeviceProp;
118
+ typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
119
+ typedef struct cudaOccDeviceState cudaOccDeviceState;
120
+
121
+ /**
122
+ * The CUDA occupancy calculator computes the occupancy of the function
123
+ * described by attributes with the given block size (blockSize), static device
124
+ * properties (properties), dynamic device states (states) and per-block dynamic
125
+ * shared memory allocation (dynamicSMemSize) in bytes, and output it through
126
+ * result along with other useful information. The occupancy is computed in
127
+ * terms of the maximum number of active blocks per multiprocessor. The user can
128
+ * then convert it to other metrics, such as number of active warps.
129
+ *
130
+ * RETURN VALUE
131
+ *
132
+ * The occupancy and related information is returned through result.
133
+ *
134
+ * If result->activeBlocksPerMultiprocessor is 0, then the given parameter
135
+ * combination cannot run on the device.
136
+ *
137
+ * ERRORS
138
+ *
139
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
140
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
141
+ * current implementation or device is invalid
142
+ */
143
+ static __OCC_INLINE
144
+ cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
145
+ cudaOccResult *result, // out
146
+ const cudaOccDeviceProp *properties, // in
147
+ const cudaOccFuncAttributes *attributes, // in
148
+ const cudaOccDeviceState *state, // in
149
+ int blockSize, // in
150
+ size_t dynamicSmemSize); // in
151
+
152
+ /**
153
+ * The CUDA launch configurator C API suggests a grid / block size pair (in
154
+ * minGridSize and blockSize) that achieves the best potential occupancy
155
+ * (i.e. maximum number of active warps with the smallest number of blocks) for
156
+ * the given function described by attributes, on a device described by
157
+ * properties with settings in state.
158
+ *
159
+ * If per-block dynamic shared memory allocation is not needed, the user should
160
+ * leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0.
161
+ *
162
+ * If per-block dynamic shared memory allocation is needed, then if the dynamic
163
+ * shared memory size is constant regardless of block size, the size should be
164
+ * passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
165
+ * NULL.
166
+ *
167
+ * Otherwise, if the per-block dynamic shared memory size varies with different
168
+ * block sizes, the user needs to provide a pointer to an unary function through
169
+ * blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
170
+ * a block of the function for any given block size. dynamicSMemSize is
171
+ * ignored. An example signature is:
172
+ *
173
+ * // Take block size, returns dynamic shared memory needed
174
+ * size_t blockToSmem(int blockSize);
175
+ *
176
+ * RETURN VALUE
177
+ *
178
+ * The suggested block size and the minimum number of blocks needed to achieve
179
+ * the maximum occupancy are returned through blockSize and minGridSize.
180
+ *
181
+ * If *blockSize is 0, then the given combination cannot run on the device.
182
+ *
183
+ * ERRORS
184
+ *
185
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
186
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
187
+ * current implementation or device is invalid
188
+ *
189
+ */
190
+ static __OCC_INLINE
191
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
192
+ int *minGridSize, // out
193
+ int *blockSize, // out
194
+ const cudaOccDeviceProp *properties, // in
195
+ const cudaOccFuncAttributes *attributes, // in
196
+ const cudaOccDeviceState *state, // in
197
+ size_t (*blockSizeToDynamicSMemSize)(int), // in
198
+ size_t dynamicSMemSize); // in
199
+
200
+ /**
201
+ * The CUDA launch configurator C++ API suggests a grid / block size pair (in
202
+ * minGridSize and blockSize) that achieves the best potential occupancy
203
+ * (i.e. the maximum number of active warps with the smallest number of blocks)
204
+ * for the given function described by attributes, on a device described by
205
+ * properties with settings in state.
206
+ *
207
+ * If per-block dynamic shared memory allocation is 0 or constant regardless of
208
+ * block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
209
+ * configure the launch. A constant dynamic shared memory allocation size in
210
+ * bytes can be passed through dynamicSMemSize.
211
+ *
212
+ * Otherwise, if the per-block dynamic shared memory size varies with different
213
+ * block sizes, the user needs to use
214
+ * cudaOccMaxPotentialOccupancyBlockSizeVariableSmem instead, and provide a
215
+ * functor / pointer to an unary function (blockSizeToDynamicSMemSize) that
216
+ * computes the dynamic shared memory needed by func for any given block
217
+ * size. An example signature is:
218
+ *
219
+ * // Take block size, returns per-block dynamic shared memory needed
220
+ * size_t blockToSmem(int blockSize);
221
+ *
222
+ * RETURN VALUE
223
+ *
224
+ * The suggested block size and the minimum number of blocks needed to achieve
225
+ * the maximum occupancy are returned through blockSize and minGridSize.
226
+ *
227
+ * If *blockSize is 0, then the given combination cannot run on the device.
228
+ *
229
+ * ERRORS
230
+ *
231
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
232
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
233
+ * current implementation or device is invalid
234
+ *
235
+ */
236
+
237
+ #if defined(__cplusplus)
238
+ namespace {
239
+
240
+ __OCC_INLINE
241
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
242
+ int *minGridSize, // out
243
+ int *blockSize, // out
244
+ const cudaOccDeviceProp *properties, // in
245
+ const cudaOccFuncAttributes *attributes, // in
246
+ const cudaOccDeviceState *state, // in
247
+ size_t dynamicSMemSize = 0); // in
248
+
249
+ template <typename UnaryFunction>
250
+ __OCC_INLINE
251
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
252
+ int *minGridSize, // out
253
+ int *blockSize, // out
254
+ const cudaOccDeviceProp *properties, // in
255
+ const cudaOccFuncAttributes *attributes, // in
256
+ const cudaOccDeviceState *state, // in
257
+ UnaryFunction blockSizeToDynamicSMemSize); // in
258
+
259
+ } // namespace anonymous
260
+ #endif // defined(__cplusplus)
261
+
262
+ /**
263
+ *
264
+ * The CUDA dynamic shared memory calculator computes the maximum size of
265
+ * per-block dynamic shared memory if we want to place numBlocks blocks
266
+ * on an SM.
267
+ *
268
+ * RETURN VALUE
269
+ *
270
+ * Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow
271
+ * numBlocks blocks per SM.
272
+ *
273
+ * ERRORS
274
+ *
275
+ * CUDA_OCC_ERROR_INVALID_INPUT input parameter is invalid.
276
+ * CUDA_OCC_ERROR_UNKNOWN_DEVICE requested device is not supported in
277
+ * current implementation or device is invalid
278
+ *
279
+ */
280
+ static __OCC_INLINE
281
+ cudaOccError cudaOccAvailableDynamicSMemPerBlock(
282
+ size_t *dynamicSmemSize,
283
+ const cudaOccDeviceProp *properties,
284
+ const cudaOccFuncAttributes *attributes,
285
+ const cudaOccDeviceState *state,
286
+ int numBlocks,
287
+ int blockSize);
288
+
289
+ /**
290
+ * Data structures
291
+ *
292
+ * These structures are subject to change for future architecture and CUDA
293
+ * releases. C users should initialize the structure as {0}.
294
+ *
295
+ */
296
+
297
+ /**
298
+ * Device descriptor
299
+ *
300
+ * This structure describes a device.
301
+ */
302
+ struct cudaOccDeviceProp {
303
+ int computeMajor; // Compute capability major version
304
+ int computeMinor; // Compute capability minor
305
+ // version. None supported minor version
306
+ // may cause error
307
+ int maxThreadsPerBlock; // Maximum number of threads per block
308
+ int maxThreadsPerMultiprocessor; // Maximum number of threads per SM
309
+ // i.e. (Max. number of warps) x (warp
310
+ // size)
311
+ int regsPerBlock; // Maximum number of registers per block
312
+ int regsPerMultiprocessor; // Maximum number of registers per SM
313
+ int warpSize; // Warp size
314
+ size_t sharedMemPerBlock; // Maximum shared memory size per block
315
+ size_t sharedMemPerMultiprocessor; // Maximum shared memory size per SM
316
+ int numSms; // Number of SMs available
317
+ size_t sharedMemPerBlockOptin; // Maximum optin shared memory size per block
318
+ size_t reservedSharedMemPerBlock; // Shared memory per block reserved by driver
319
+
320
+ #ifdef __cplusplus
321
+ // This structure can be converted from a cudaDeviceProp structure for users
322
+ // that use this header in their CUDA applications.
323
+ //
324
+ // If the application have access to the CUDA Runtime API, the application
325
+ // can obtain the device properties of a CUDA device through
326
+ // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
327
+ // cudaDeviceProp structure.
328
+ //
329
+ // Example:
330
+ /*
331
+ {
332
+ cudaDeviceProp prop;
333
+
334
+ cudaGetDeviceProperties(&prop, ...);
335
+
336
+ cudaOccDeviceProp occProp = prop;
337
+
338
+ ...
339
+
340
+ cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
341
+ }
342
+ */
343
+ //
344
+ template<typename DeviceProp>
345
+ __OCC_INLINE
346
+ cudaOccDeviceProp(const DeviceProp &props)
347
+ : computeMajor (props.major),
348
+ computeMinor (props.minor),
349
+ maxThreadsPerBlock (props.maxThreadsPerBlock),
350
+ maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
351
+ regsPerBlock (props.regsPerBlock),
352
+ regsPerMultiprocessor (props.regsPerMultiprocessor),
353
+ warpSize (props.warpSize),
354
+ sharedMemPerBlock (props.sharedMemPerBlock),
355
+ sharedMemPerMultiprocessor (props.sharedMemPerMultiprocessor),
356
+ numSms (props.multiProcessorCount),
357
+ sharedMemPerBlockOptin (props.sharedMemPerBlockOptin),
358
+ reservedSharedMemPerBlock (props.reservedSharedMemPerBlock)
359
+ {}
360
+
361
+ __OCC_INLINE
362
+ cudaOccDeviceProp()
363
+ : computeMajor (0),
364
+ computeMinor (0),
365
+ maxThreadsPerBlock (0),
366
+ maxThreadsPerMultiprocessor (0),
367
+ regsPerBlock (0),
368
+ regsPerMultiprocessor (0),
369
+ warpSize (0),
370
+ sharedMemPerBlock (0),
371
+ sharedMemPerMultiprocessor (0),
372
+ numSms (0),
373
+ sharedMemPerBlockOptin (0),
374
+ reservedSharedMemPerBlock (0)
375
+ {}
376
+ #endif // __cplusplus
377
+ };
378
+
379
+ /**
380
+ * Partitioned global caching option
381
+ */
382
+ typedef enum cudaOccPartitionedGCConfig_enum {
383
+ PARTITIONED_GC_OFF, // Disable partitioned global caching
384
+ PARTITIONED_GC_ON, // Prefer partitioned global caching
385
+ PARTITIONED_GC_ON_STRICT // Force partitioned global caching
386
+ } cudaOccPartitionedGCConfig;
387
+
388
+ /**
389
+ * Per function opt in maximum dynamic shared memory limit
390
+ */
391
+ typedef enum cudaOccFuncShmemConfig_enum {
392
+ FUNC_SHMEM_LIMIT_DEFAULT, // Default shmem limit
393
+ FUNC_SHMEM_LIMIT_OPTIN, // Use the optin shmem limit
394
+ } cudaOccFuncShmemConfig;
395
+
396
+ /**
397
+ * Function descriptor
398
+ *
399
+ * This structure describes a CUDA function.
400
+ */
401
+ struct cudaOccFuncAttributes {
402
+ int maxThreadsPerBlock; // Maximum block size the function can work with. If
403
+ // unlimited, use INT_MAX or any value greater than
404
+ // or equal to maxThreadsPerBlock of the device
405
+ int numRegs; // Number of registers used. When the function is
406
+ // launched on device, the register count may change
407
+ // due to internal tools requirements.
408
+ size_t sharedSizeBytes; // Number of static shared memory used
409
+
410
+ cudaOccPartitionedGCConfig partitionedGCConfig;
411
+ // Partitioned global caching is required to enable
412
+ // caching on certain chips, such as sm_52
413
+ // devices. Partitioned global caching can be
414
+ // automatically disabled if the occupancy
415
+ // requirement of the launch cannot support caching.
416
+ //
417
+ // To override this behavior with caching on and
418
+ // calculate occupancy strictly according to the
419
+ // preference, set partitionedGCConfig to
420
+ // PARTITIONED_GC_ON_STRICT. This is especially
421
+ // useful for experimenting and finding launch
422
+ // configurations (MaxPotentialOccupancyBlockSize)
423
+ // that allow global caching to take effect.
424
+ //
425
+ // This flag only affects the occupancy calculation.
426
+
427
+ cudaOccFuncShmemConfig shmemLimitConfig;
428
+ // Certain chips like sm_70 allow a user to opt into
429
+ // a higher per block limit of dynamic shared memory
430
+ // This optin is performed on a per function basis
431
+ // using the cuFuncSetAttribute function
432
+
433
+ size_t maxDynamicSharedSizeBytes;
434
+ // User set limit on maximum dynamic shared memory
435
+ // usable by the kernel
436
+ // This limit is set using the cuFuncSetAttribute
437
+ // function.
438
+
439
+ int numBlockBarriers; // Number of block barriers used (default to 1)
440
+ #ifdef __cplusplus
441
+ // This structure can be converted from a cudaFuncAttributes structure for
442
+ // users that use this header in their CUDA applications.
443
+ //
444
+ // If the application have access to the CUDA Runtime API, the application
445
+ // can obtain the function attributes of a CUDA kernel function through
446
+ // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
447
+ // cudaFuncAttributes structure.
448
+ //
449
+ // Example:
450
+ /*
451
+ __global__ void foo() {...}
452
+
453
+ ...
454
+
455
+ {
456
+ cudaFuncAttributes attr;
457
+
458
+ cudaFuncGetAttributes(&attr, foo);
459
+
460
+ cudaOccFuncAttributes occAttr = attr;
461
+
462
+ ...
463
+
464
+ cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
465
+ }
466
+ */
467
+ //
468
+ template<typename FuncAttributes>
469
+ __OCC_INLINE
470
+ cudaOccFuncAttributes(const FuncAttributes &attr)
471
+ : maxThreadsPerBlock (attr.maxThreadsPerBlock),
472
+ numRegs (attr.numRegs),
473
+ sharedSizeBytes (attr.sharedSizeBytes),
474
+ partitionedGCConfig (PARTITIONED_GC_OFF),
475
+ shmemLimitConfig (FUNC_SHMEM_LIMIT_OPTIN),
476
+ maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
477
+ numBlockBarriers (1)
478
+ {}
479
+
480
+ __OCC_INLINE
481
+ cudaOccFuncAttributes()
482
+ : maxThreadsPerBlock (0),
483
+ numRegs (0),
484
+ sharedSizeBytes (0),
485
+ partitionedGCConfig (PARTITIONED_GC_OFF),
486
+ shmemLimitConfig (FUNC_SHMEM_LIMIT_DEFAULT),
487
+ maxDynamicSharedSizeBytes (0),
488
+ numBlockBarriers (0)
489
+ {}
490
+ #endif
491
+ };
492
+
493
+ typedef enum cudaOccCacheConfig_enum {
494
+ CACHE_PREFER_NONE = 0x00, // no preference for shared memory or L1 (default)
495
+ CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache
496
+ CACHE_PREFER_L1 = 0x02, // prefer larger L1 cache and smaller shared memory
497
+ CACHE_PREFER_EQUAL = 0x03 // prefer equal sized L1 cache and shared memory
498
+ } cudaOccCacheConfig;
499
+
500
+ typedef enum cudaOccCarveoutConfig_enum {
501
+ SHAREDMEM_CARVEOUT_DEFAULT = -1, // no preference for shared memory or L1 (default)
502
+ SHAREDMEM_CARVEOUT_MAX_SHARED = 100, // prefer maximum available shared memory, minimum L1 cache
503
+ SHAREDMEM_CARVEOUT_MAX_L1 = 0, // prefer maximum available L1 cache, minimum shared memory
504
+ SHAREDMEM_CARVEOUT_HALF = 50 // prefer half of maximum available shared memory, with the rest as L1 cache
505
+ } cudaOccCarveoutConfig;
506
+
507
+ /**
508
+ * Device state descriptor
509
+ *
510
+ * This structure describes device settings that affect occupancy calculation.
511
+ */
512
+ struct cudaOccDeviceState
513
+ {
514
+ // Cache / shared memory split preference. Deprecated on Volta
515
+ cudaOccCacheConfig cacheConfig;
516
+ // Shared memory / L1 split preference. Supported on only Volta
517
+ int carveoutConfig;
518
+
519
+ #ifdef __cplusplus
520
+ __OCC_INLINE
521
+ cudaOccDeviceState()
522
+ : cacheConfig (CACHE_PREFER_NONE),
523
+ carveoutConfig (SHAREDMEM_CARVEOUT_DEFAULT)
524
+ {}
525
+ #endif
526
+ };
527
+
528
+ typedef enum cudaOccLimitingFactor_enum {
529
+ // Occupancy limited due to:
530
+ OCC_LIMIT_WARPS = 0x01, // - warps available
531
+ OCC_LIMIT_REGISTERS = 0x02, // - registers available
532
+ OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available
533
+ OCC_LIMIT_BLOCKS = 0x08, // - blocks available
534
+ OCC_LIMIT_BARRIERS = 0x10 // - barrier available
535
+ } cudaOccLimitingFactor;
536
+
537
+ /**
538
+ * Occupancy output
539
+ *
540
+ * This structure contains occupancy calculator's output.
541
+ */
542
+ struct cudaOccResult {
543
+ int activeBlocksPerMultiprocessor; // Occupancy
544
+ unsigned int limitingFactors; // Factors that limited occupancy. A bit
545
+ // field that counts the limiting
546
+ // factors, see cudaOccLimitingFactor
547
+ int blockLimitRegs; // Occupancy due to register
548
+ // usage, INT_MAX if the kernel does not
549
+ // use any register.
550
+ int blockLimitSharedMem; // Occupancy due to shared memory
551
+ // usage, INT_MAX if the kernel does not
552
+ // use shared memory.
553
+ int blockLimitWarps; // Occupancy due to block size limit
554
+ int blockLimitBlocks; // Occupancy due to maximum number of blocks
555
+ // managable per SM
556
+ int blockLimitBarriers; // Occupancy due to block barrier usage
557
+ int allocatedRegistersPerBlock; // Actual number of registers allocated per
558
+ // block
559
+ size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
560
+ // per block
561
+ cudaOccPartitionedGCConfig partitionedGCConfig;
562
+ // Report if partitioned global caching
563
+ // is actually enabled.
564
+ };
565
+
566
+ /**
567
+ * Partitioned global caching support
568
+ *
569
+ * See cudaOccPartitionedGlobalCachingModeSupport
570
+ */
571
+ typedef enum cudaOccPartitionedGCSupport_enum {
572
+ PARTITIONED_GC_NOT_SUPPORTED, // Partitioned global caching is not supported
573
+ PARTITIONED_GC_SUPPORTED, // Partitioned global caching is supported
574
+ } cudaOccPartitionedGCSupport;
575
+
576
+ /**
577
+ * Implementation
578
+ */
579
+
580
+ /**
581
+ * Max compute capability supported
582
+ */
583
+ #define __CUDA_OCC_MAJOR__ 9
584
+ #define __CUDA_OCC_MINOR__ 0
585
+
586
+ //////////////////////////////////////////
587
+ // Mathematical Helper Functions //
588
+ //////////////////////////////////////////
589
+
590
+ static __OCC_INLINE int __occMin(int lhs, int rhs)
591
+ {
592
+ return rhs < lhs ? rhs : lhs;
593
+ }
594
+
595
+ static __OCC_INLINE int __occDivideRoundUp(int x, int y)
596
+ {
597
+ return (x + (y - 1)) / y;
598
+ }
599
+
600
+ static __OCC_INLINE int __occRoundUp(int x, int y)
601
+ {
602
+ return y * __occDivideRoundUp(x, y);
603
+ }
604
+
605
+ //////////////////////////////////////////
606
+ // Architectural Properties //
607
+ //////////////////////////////////////////
608
+
609
+ /**
610
+ * Granularity of shared memory allocation
611
+ */
612
+ static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
613
+ {
614
+ int value;
615
+
616
+ switch(properties->computeMajor) {
617
+ case 3:
618
+ case 5:
619
+ case 6:
620
+ case 7:
621
+ value = 256;
622
+ break;
623
+ case 8:
624
+ case 9:
625
+ value = 128;
626
+ break;
627
+ default:
628
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
629
+ }
630
+
631
+ *limit = value;
632
+
633
+ return CUDA_OCC_SUCCESS;
634
+ }
635
+
636
+ /**
637
+ * Maximum number of registers per thread
638
+ */
639
+ static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
640
+ {
641
+ int value;
642
+
643
+ switch(properties->computeMajor) {
644
+ case 3:
645
+ case 5:
646
+ case 6:
647
+ value = 255;
648
+ break;
649
+ case 7:
650
+ case 8:
651
+ case 9:
652
+ value = 256;
653
+ break;
654
+ default:
655
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
656
+ }
657
+
658
+ *limit = value;
659
+
660
+ return CUDA_OCC_SUCCESS;
661
+ }
662
+
663
+ /**
664
+ * Granularity of register allocation
665
+ */
666
+ static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
667
+ {
668
+ int value;
669
+
670
+ switch(properties->computeMajor) {
671
+ case 3:
672
+ case 5:
673
+ case 6:
674
+ case 7:
675
+ case 8:
676
+ case 9:
677
+ value = 256;
678
+ break;
679
+ default:
680
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
681
+ }
682
+
683
+ *limit = value;
684
+
685
+ return CUDA_OCC_SUCCESS;
686
+ }
687
+
688
+ /**
689
+ * Number of sub-partitions
690
+ */
691
+ static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
692
+ {
693
+ int value;
694
+
695
+ switch(properties->computeMajor) {
696
+ case 3:
697
+ case 5:
698
+ case 7:
699
+ case 8:
700
+ case 9:
701
+ value = 4;
702
+ break;
703
+ case 6:
704
+ value = properties->computeMinor ? 4 : 2;
705
+ break;
706
+ default:
707
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
708
+ }
709
+
710
+ *limit = value;
711
+
712
+ return CUDA_OCC_SUCCESS;
713
+ }
714
+
715
+
716
+ /**
717
+ * Maximum number of blocks that can run simultaneously on a multiprocessor
718
+ */
719
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
720
+ {
721
+ int value;
722
+
723
+ switch(properties->computeMajor) {
724
+ case 3:
725
+ value = 16;
726
+ break;
727
+ case 5:
728
+ case 6:
729
+ value = 32;
730
+ break;
731
+ case 7: {
732
+ int isTuring = properties->computeMinor == 5;
733
+ value = (isTuring) ? 16 : 32;
734
+ break;
735
+ }
736
+ case 8:
737
+ if (properties->computeMinor == 0) {
738
+ value = 32;
739
+ }
740
+ else if (properties->computeMinor == 9) {
741
+ value = 24;
742
+ }
743
+ else {
744
+ value = 16;
745
+ }
746
+ break;
747
+ case 9:
748
+ value = 32;
749
+ break;
750
+ default:
751
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
752
+ }
753
+
754
+ *limit = value;
755
+
756
+ return CUDA_OCC_SUCCESS;
757
+ }
758
+
759
+ /**
760
+ * Align up shared memory based on compute major configurations
761
+ */
762
+ static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
763
+ {
764
+ // Volta and Turing have shared L1 cache / shared memory, and support cache
765
+ // configuration to trade one for the other. These values are needed to
766
+ // map carveout config ratio to the next available architecture size
767
+ size_t size = *shMemSize;
768
+
769
+ switch (properties->computeMajor) {
770
+ case 7: {
771
+ // Turing supports 32KB and 64KB shared mem.
772
+ int isTuring = properties->computeMinor == 5;
773
+ if (isTuring) {
774
+ if (size <= 32 * 1024) {
775
+ *shMemSize = 32 * 1024;
776
+ }
777
+ else if (size <= 64 * 1024) {
778
+ *shMemSize = 64 * 1024;
779
+ }
780
+ else {
781
+ return CUDA_OCC_ERROR_INVALID_INPUT;
782
+ }
783
+ }
784
+ // Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem.
785
+ else {
786
+ if (size == 0) {
787
+ *shMemSize = 0;
788
+ }
789
+ else if (size <= 8 * 1024) {
790
+ *shMemSize = 8 * 1024;
791
+ }
792
+ else if (size <= 16 * 1024) {
793
+ *shMemSize = 16 * 1024;
794
+ }
795
+ else if (size <= 32 * 1024) {
796
+ *shMemSize = 32 * 1024;
797
+ }
798
+ else if (size <= 64 * 1024) {
799
+ *shMemSize = 64 * 1024;
800
+ }
801
+ else if (size <= 96 * 1024) {
802
+ *shMemSize = 96 * 1024;
803
+ }
804
+ else {
805
+ return CUDA_OCC_ERROR_INVALID_INPUT;
806
+ }
807
+ }
808
+ break;
809
+ }
810
+ case 8:
811
+ if (properties->computeMinor == 0 || properties->computeMinor == 7) {
812
+ if (size == 0) {
813
+ *shMemSize = 0;
814
+ }
815
+ else if (size <= 8 * 1024) {
816
+ *shMemSize = 8 * 1024;
817
+ }
818
+ else if (size <= 16 * 1024) {
819
+ *shMemSize = 16 * 1024;
820
+ }
821
+ else if (size <= 32 * 1024) {
822
+ *shMemSize = 32 * 1024;
823
+ }
824
+ else if (size <= 64 * 1024) {
825
+ *shMemSize = 64 * 1024;
826
+ }
827
+ else if (size <= 100 * 1024) {
828
+ *shMemSize = 100 * 1024;
829
+ }
830
+ else if (size <= 132 * 1024) {
831
+ *shMemSize = 132 * 1024;
832
+ }
833
+ else if (size <= 164 * 1024) {
834
+ *shMemSize = 164 * 1024;
835
+ }
836
+ else {
837
+ return CUDA_OCC_ERROR_INVALID_INPUT;
838
+ }
839
+ }
840
+ else {
841
+ if (size == 0) {
842
+ *shMemSize = 0;
843
+ }
844
+ else if (size <= 8 * 1024) {
845
+ *shMemSize = 8 * 1024;
846
+ }
847
+ else if (size <= 16 * 1024) {
848
+ *shMemSize = 16 * 1024;
849
+ }
850
+ else if (size <= 32 * 1024) {
851
+ *shMemSize = 32 * 1024;
852
+ }
853
+ else if (size <= 64 * 1024) {
854
+ *shMemSize = 64 * 1024;
855
+ }
856
+ else if (size <= 100 * 1024) {
857
+ *shMemSize = 100 * 1024;
858
+ }
859
+ else {
860
+ return CUDA_OCC_ERROR_INVALID_INPUT;
861
+ }
862
+ }
863
+ break;
864
+ case 9: {
865
+ if (size == 0) {
866
+ *shMemSize = 0;
867
+ }
868
+ else if (size <= 8 * 1024) {
869
+ *shMemSize = 8 * 1024;
870
+ }
871
+ else if (size <= 16 * 1024) {
872
+ *shMemSize = 16 * 1024;
873
+ }
874
+ else if (size <= 32 * 1024) {
875
+ *shMemSize = 32 * 1024;
876
+ }
877
+ else if (size <= 64 * 1024) {
878
+ *shMemSize = 64 * 1024;
879
+ }
880
+ else if (size <= 100 * 1024) {
881
+ *shMemSize = 100 * 1024;
882
+ }
883
+ else if (size <= 132 * 1024) {
884
+ *shMemSize = 132 * 1024;
885
+ }
886
+ else if (size <= 164 * 1024) {
887
+ *shMemSize = 164 * 1024;
888
+ }
889
+ else if (size <= 196 * 1024) {
890
+ *shMemSize = 196 * 1024;
891
+ }
892
+ else if (size <= 228 * 1024) {
893
+ *shMemSize = 228 * 1024;
894
+ }
895
+ else {
896
+ return CUDA_OCC_ERROR_INVALID_INPUT;
897
+ }
898
+ break;
899
+ }
900
+ default:
901
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
902
+ }
903
+
904
+ return CUDA_OCC_SUCCESS;
905
+ }
906
+
907
+ /**
908
+ * Shared memory based on the new carveoutConfig API introduced with Volta
909
+ */
910
+ static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
911
+ {
912
+ cudaOccError status = CUDA_OCC_SUCCESS;
913
+ size_t preferenceShmemSize;
914
+
915
+ // CUDA 9.0 introduces a new API to set shared memory - L1 configuration on supported
916
+ // devices. This preference will take precedence over the older cacheConfig setting.
917
+ // Map cacheConfig to its effective preference value.
918
+ int effectivePreference = state->carveoutConfig;
919
+ if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) {
920
+ return CUDA_OCC_ERROR_INVALID_INPUT;
921
+ }
922
+
923
+ if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
924
+ switch (state->cacheConfig)
925
+ {
926
+ case CACHE_PREFER_L1:
927
+ effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1;
928
+ break;
929
+ case CACHE_PREFER_SHARED:
930
+ effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED;
931
+ break;
932
+ case CACHE_PREFER_EQUAL:
933
+ effectivePreference = SHAREDMEM_CARVEOUT_HALF;
934
+ break;
935
+ default:
936
+ effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT;
937
+ break;
938
+ }
939
+ }
940
+
941
+ if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
942
+ preferenceShmemSize = properties->sharedMemPerMultiprocessor;
943
+ }
944
+ else {
945
+ preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100;
946
+ }
947
+
948
+ status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties);
949
+ *limit = preferenceShmemSize;
950
+ return status;
951
+ }
952
+
953
+ /**
954
+ * Shared memory based on the cacheConfig
955
+ */
956
+ static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
957
+ {
958
+ size_t bytes = 0;
959
+ size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor;
960
+ cudaOccCacheConfig cacheConfig = state->cacheConfig;
961
+
962
+ // Kepler has shared L1 cache / shared memory, and support cache
963
+ // configuration to trade one for the other. These values are needed to
964
+ // calculate the correct shared memory size for user requested cache
965
+ // configuration.
966
+ //
967
+ size_t minCacheSize = 16384;
968
+ size_t maxCacheSize = 49152;
969
+ size_t cacheAndSharedTotal = sharedMemPerMultiprocessorHigh + minCacheSize;
970
+ size_t sharedMemPerMultiprocessorLow = cacheAndSharedTotal - maxCacheSize;
971
+
972
+ switch (properties->computeMajor) {
973
+ case 3:
974
+ // Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest
975
+ // is shared memory.
976
+ //
977
+ switch (cacheConfig) {
978
+ default :
979
+ case CACHE_PREFER_NONE:
980
+ case CACHE_PREFER_SHARED:
981
+ bytes = sharedMemPerMultiprocessorHigh;
982
+ break;
983
+ case CACHE_PREFER_L1:
984
+ bytes = sharedMemPerMultiprocessorLow;
985
+ break;
986
+ case CACHE_PREFER_EQUAL:
987
+ // Equal is the mid-point between high and low. It should be
988
+ // equivalent to low + 16KB.
989
+ //
990
+ bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2;
991
+ break;
992
+ }
993
+ break;
994
+ case 5:
995
+ case 6:
996
+ // Maxwell and Pascal have dedicated shared memory.
997
+ //
998
+ bytes = sharedMemPerMultiprocessorHigh;
999
+ break;
1000
+ default:
1001
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
1002
+ }
1003
+
1004
+ *limit = bytes;
1005
+
1006
+ return CUDA_OCC_SUCCESS;
1007
+ }
1008
+
1009
+ /**
1010
+ * Shared memory based on config requested by User
1011
+ */
1012
+ static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
1013
+ {
1014
+ // Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference,
1015
+ // it is handled separately from the cache config preference.
1016
+ if (properties->computeMajor >= 7) {
1017
+ return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
1018
+ }
1019
+ return cudaOccSMemPreference(limit, properties, state);
1020
+ }
1021
+
1022
+ /**
1023
+ * Return the per block shared memory limit based on function config
1024
+ */
1025
+ static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
1026
+ {
1027
+ switch (properties->computeMajor) {
1028
+ case 2:
1029
+ case 3:
1030
+ case 4:
1031
+ case 5:
1032
+ case 6:
1033
+ *limit = properties->sharedMemPerBlock;
1034
+ break;
1035
+ case 7:
1036
+ case 8:
1037
+ case 9:
1038
+ switch (shmemLimitConfig) {
1039
+ default:
1040
+ case FUNC_SHMEM_LIMIT_DEFAULT:
1041
+ *limit = properties->sharedMemPerBlock;
1042
+ break;
1043
+ case FUNC_SHMEM_LIMIT_OPTIN:
1044
+ if (smemPerCta > properties->sharedMemPerBlock) {
1045
+ *limit = properties->sharedMemPerBlockOptin;
1046
+ }
1047
+ else {
1048
+ *limit = properties->sharedMemPerBlock;
1049
+ }
1050
+ break;
1051
+ }
1052
+ break;
1053
+ default:
1054
+ return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
1055
+ }
1056
+
1057
+ // Starting Ampere, CUDA driver reserves additional shared memory per block
1058
+ if (properties->computeMajor >= 8) {
1059
+ *limit += properties->reservedSharedMemPerBlock;
1060
+ }
1061
+
1062
+ return CUDA_OCC_SUCCESS;
1063
+ }
1064
+
1065
+ /**
1066
+ * Partitioned global caching mode support
1067
+ */
1068
+ static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
1069
+ {
1070
+ *limit = PARTITIONED_GC_NOT_SUPPORTED;
1071
+
1072
+ if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) ||
1073
+ properties->computeMajor == 6) {
1074
+ *limit = PARTITIONED_GC_SUPPORTED;
1075
+ }
1076
+
1077
+ if (properties->computeMajor == 6 && properties->computeMinor == 0) {
1078
+ *limit = PARTITIONED_GC_NOT_SUPPORTED;
1079
+ }
1080
+
1081
+ return CUDA_OCC_SUCCESS;
1082
+ }
1083
+
1084
+ ///////////////////////////////////////////////
1085
+ // User Input Sanity //
1086
+ ///////////////////////////////////////////////
1087
+
1088
+ static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
1089
+ {
1090
+ // Verify device properties
1091
+ //
1092
+ // Each of these limits must be a positive number.
1093
+ //
1094
+ // Compute capacity is checked during the occupancy calculation
1095
+ //
1096
+ if (properties->maxThreadsPerBlock <= 0 ||
1097
+ properties->maxThreadsPerMultiprocessor <= 0 ||
1098
+ properties->regsPerBlock <= 0 ||
1099
+ properties->regsPerMultiprocessor <= 0 ||
1100
+ properties->warpSize <= 0 ||
1101
+ properties->sharedMemPerBlock <= 0 ||
1102
+ properties->sharedMemPerMultiprocessor <= 0 ||
1103
+ properties->numSms <= 0) {
1104
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1105
+ }
1106
+
1107
+ return CUDA_OCC_SUCCESS;
1108
+ }
1109
+
1110
+ static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
1111
+ {
1112
+ // Verify function attributes
1113
+ //
1114
+ if (attributes->maxThreadsPerBlock <= 0 ||
1115
+ attributes->numRegs < 0) { // Compiler may choose not to use
1116
+ // any register (empty kernels,
1117
+ // etc.)
1118
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1119
+ }
1120
+
1121
+ return CUDA_OCC_SUCCESS;
1122
+ }
1123
+
1124
+ static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
1125
+ {
1126
+ (void)state; // silence unused-variable warning
1127
+ // Placeholder
1128
+ //
1129
+
1130
+ return CUDA_OCC_SUCCESS;
1131
+ }
1132
+
1133
+ static __OCC_INLINE cudaOccError cudaOccInputCheck(
1134
+ const cudaOccDeviceProp *properties,
1135
+ const cudaOccFuncAttributes *attributes,
1136
+ const cudaOccDeviceState *state)
1137
+ {
1138
+ cudaOccError status = CUDA_OCC_SUCCESS;
1139
+
1140
+ status = cudaOccDevicePropCheck(properties);
1141
+ if (status != CUDA_OCC_SUCCESS) {
1142
+ return status;
1143
+ }
1144
+
1145
+ status = cudaOccFuncAttributesCheck(attributes);
1146
+ if (status != CUDA_OCC_SUCCESS) {
1147
+ return status;
1148
+ }
1149
+
1150
+ status = cudaOccDeviceStateCheck(state);
1151
+ if (status != CUDA_OCC_SUCCESS) {
1152
+ return status;
1153
+ }
1154
+
1155
+ return status;
1156
+ }
1157
+
1158
+ ///////////////////////////////////////////////
1159
+ // Occupancy calculation Functions //
1160
+ ///////////////////////////////////////////////
1161
+
1162
+ static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
1163
+ const cudaOccDeviceProp *properties,
1164
+ const cudaOccFuncAttributes *attributes)
1165
+ {
1166
+ cudaOccPartitionedGCSupport gcSupport;
1167
+ cudaOccPartitionedGCConfig gcConfig;
1168
+
1169
+ cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties);
1170
+
1171
+ gcConfig = attributes->partitionedGCConfig;
1172
+
1173
+ if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) {
1174
+ gcConfig = PARTITIONED_GC_OFF;
1175
+ }
1176
+
1177
+ return gcConfig;
1178
+ }
1179
+
1180
+ // Warp limit
1181
+ //
1182
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
1183
+ int *limit,
1184
+ cudaOccPartitionedGCConfig gcConfig,
1185
+ const cudaOccDeviceProp *properties,
1186
+ const cudaOccFuncAttributes *attributes,
1187
+ int blockSize)
1188
+ {
1189
+ cudaOccError status = CUDA_OCC_SUCCESS;
1190
+ int maxWarpsPerSm;
1191
+ int warpsAllocatedPerCTA;
1192
+ int maxBlocks;
1193
+ (void)attributes; // silence unused-variable warning
1194
+
1195
+ if (blockSize > properties->maxThreadsPerBlock) {
1196
+ maxBlocks = 0;
1197
+ }
1198
+ else {
1199
+ maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
1200
+ warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
1201
+ maxBlocks = 0;
1202
+
1203
+ if (gcConfig != PARTITIONED_GC_OFF) {
1204
+ int maxBlocksPerSmPartition;
1205
+ int maxWarpsPerSmPartition;
1206
+
1207
+ // If partitioned global caching is on, then a CTA can only use a SM
1208
+ // partition (a half SM), and thus a half of the warp slots
1209
+ // available per SM
1210
+ //
1211
+ maxWarpsPerSmPartition = maxWarpsPerSm / 2;
1212
+ maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA;
1213
+ maxBlocks = maxBlocksPerSmPartition * 2;
1214
+ }
1215
+ // On hardware that supports partitioned global caching, each half SM is
1216
+ // guaranteed to support at least 32 warps (maximum number of warps of a
1217
+ // CTA), so caching will not cause 0 occupancy due to insufficient warp
1218
+ // allocation slots.
1219
+ //
1220
+ else {
1221
+ maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA;
1222
+ }
1223
+ }
1224
+
1225
+ *limit = maxBlocks;
1226
+
1227
+ return status;
1228
+ }
1229
+
1230
+ // Shared memory limit
1231
+ //
1232
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
1233
+ int *limit,
1234
+ cudaOccResult *result,
1235
+ const cudaOccDeviceProp *properties,
1236
+ const cudaOccFuncAttributes *attributes,
1237
+ const cudaOccDeviceState *state,
1238
+ int blockSize,
1239
+ size_t dynamicSmemSize)
1240
+ {
1241
+ cudaOccError status = CUDA_OCC_SUCCESS;
1242
+ int allocationGranularity;
1243
+ size_t userSmemPreference = 0;
1244
+ size_t totalSmemUsagePerCTA;
1245
+ size_t maxSmemUsagePerCTA;
1246
+ size_t smemAllocatedPerCTA;
1247
+ size_t staticSmemSize;
1248
+ size_t sharedMemPerMultiprocessor;
1249
+ size_t smemLimitPerCTA;
1250
+ int maxBlocks;
1251
+ int dynamicSmemSizeExceeded = 0;
1252
+ int totalSmemSizeExceeded = 0;
1253
+ (void)blockSize; // silence unused-variable warning
1254
+
1255
+ status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
1256
+ if (status != CUDA_OCC_SUCCESS) {
1257
+ return status;
1258
+ }
1259
+
1260
+ // Obtain the user preferred shared memory size. This setting is ignored if
1261
+ // user requests more shared memory than preferred.
1262
+ //
1263
+ status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
1264
+ if (status != CUDA_OCC_SUCCESS) {
1265
+ return status;
1266
+ }
1267
+
1268
+ staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
1269
+ totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
1270
+ smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);
1271
+
1272
+ maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;
1273
+
1274
+ dynamicSmemSizeExceeded = 0;
1275
+ totalSmemSizeExceeded = 0;
1276
+
1277
+ // Obtain the user set maximum dynamic size if it exists
1278
+ // If so, the current launch dynamic shared memory must not
1279
+ // exceed the set limit
1280
+ if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
1281
+ dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
1282
+ dynamicSmemSizeExceeded = 1;
1283
+ }
1284
+
1285
+ status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
1286
+ if (status != CUDA_OCC_SUCCESS) {
1287
+ return status;
1288
+ }
1289
+
1290
+ if (smemAllocatedPerCTA > smemLimitPerCTA) {
1291
+ totalSmemSizeExceeded = 1;
1292
+ }
1293
+
1294
+ if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
1295
+ maxBlocks = 0;
1296
+ }
1297
+ else {
1298
+ // User requested shared memory limit is used as long as it is greater
1299
+ // than the total shared memory used per CTA, i.e. as long as at least
1300
+ // one CTA can be launched.
1301
+ if (userSmemPreference >= smemAllocatedPerCTA) {
1302
+ sharedMemPerMultiprocessor = userSmemPreference;
1303
+ }
1304
+ else {
1305
+ // On Volta+, user requested shared memory will limit occupancy
1306
+ // if it's less than shared memory per CTA. Otherwise, the
1307
+ // maximum shared memory limit is used.
1308
+ if (properties->computeMajor >= 7) {
1309
+ sharedMemPerMultiprocessor = smemAllocatedPerCTA;
1310
+ status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
1311
+ if (status != CUDA_OCC_SUCCESS) {
1312
+ return status;
1313
+ }
1314
+ }
1315
+ else {
1316
+ sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
1317
+ }
1318
+ }
1319
+
1320
+ if (smemAllocatedPerCTA > 0) {
1321
+ maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
1322
+ }
1323
+ else {
1324
+ maxBlocks = INT_MAX;
1325
+ }
1326
+ }
1327
+
1328
+ result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;
1329
+
1330
+ *limit = maxBlocks;
1331
+
1332
+ return status;
1333
+ }
1334
+
1335
+ static __OCC_INLINE
1336
+ cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
1337
+ int *limit,
1338
+ cudaOccPartitionedGCConfig *gcConfig,
1339
+ cudaOccResult *result,
1340
+ const cudaOccDeviceProp *properties,
1341
+ const cudaOccFuncAttributes *attributes,
1342
+ int blockSize)
1343
+ {
1344
+ cudaOccError status = CUDA_OCC_SUCCESS;
1345
+ int allocationGranularity;
1346
+ int warpsAllocatedPerCTA;
1347
+ int regsAllocatedPerCTA;
1348
+ int regsAssumedPerCTA;
1349
+ int regsPerWarp;
1350
+ int regsAllocatedPerWarp;
1351
+ int numSubPartitions;
1352
+ int numRegsPerSubPartition;
1353
+ int numWarpsPerSubPartition;
1354
+ int numWarpsPerSM;
1355
+ int maxBlocks;
1356
+ int maxRegsPerThread;
1357
+
1358
+ status = cudaOccRegAllocationGranularity(
1359
+ &allocationGranularity,
1360
+ properties);
1361
+ if (status != CUDA_OCC_SUCCESS) {
1362
+ return status;
1363
+ }
1364
+
1365
+ status = cudaOccRegAllocationMaxPerThread(
1366
+ &maxRegsPerThread,
1367
+ properties);
1368
+ if (status != CUDA_OCC_SUCCESS) {
1369
+ return status;
1370
+ }
1371
+
1372
+ status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
1373
+ if (status != CUDA_OCC_SUCCESS) {
1374
+ return status;
1375
+ }
1376
+
1377
+ warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
1378
+
1379
+ // GPUs of compute capability 2.x and higher allocate registers to warps
1380
+ //
1381
+ // Number of regs per warp is regs per thread x warp size, rounded up to
1382
+ // register allocation granularity
1383
+ //
1384
+ regsPerWarp = attributes->numRegs * properties->warpSize;
1385
+ regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
1386
+ regsAllocatedPerCTA = regsAllocatedPerWarp * warpsAllocatedPerCTA;
1387
+
1388
+ // Hardware verifies if a launch fits the per-CTA register limit. For
1389
+ // historical reasons, the verification logic assumes register
1390
+ // allocations are made to all partitions simultaneously. Therefore, to
1391
+ // simulate the hardware check, the warp allocation needs to be rounded
1392
+ // up to the number of partitions.
1393
+ //
1394
+ regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);
1395
+
1396
+ if (properties->regsPerBlock < regsAssumedPerCTA || // Hardware check
1397
+ properties->regsPerBlock < regsAllocatedPerCTA || // Software check
1398
+ attributes->numRegs > maxRegsPerThread) { // Per thread limit check
1399
+ maxBlocks = 0;
1400
+ }
1401
+ else {
1402
+ if (regsAllocatedPerWarp > 0) {
1403
+ // Registers are allocated in each sub-partition. The max number
1404
+ // of warps that can fit on an SM is equal to the max number of
1405
+ // warps per sub-partition x number of sub-partitions.
1406
+ //
1407
+ numRegsPerSubPartition = properties->regsPerMultiprocessor / numSubPartitions;
1408
+ numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;
1409
+
1410
+ maxBlocks = 0;
1411
+
1412
+ if (*gcConfig != PARTITIONED_GC_OFF) {
1413
+ int numSubPartitionsPerSmPartition;
1414
+ int numWarpsPerSmPartition;
1415
+ int maxBlocksPerSmPartition;
1416
+
1417
+ // If partitioned global caching is on, then a CTA can only
1418
+ // use a half SM, and thus a half of the registers available
1419
+ // per SM
1420
+ //
1421
+ numSubPartitionsPerSmPartition = numSubPartitions / 2;
1422
+ numWarpsPerSmPartition = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
1423
+ maxBlocksPerSmPartition = numWarpsPerSmPartition / warpsAllocatedPerCTA;
1424
+ maxBlocks = maxBlocksPerSmPartition * 2;
1425
+ }
1426
+
1427
+ // Try again if partitioned global caching is not enabled, or if
1428
+ // the CTA cannot fit on the SM with caching on (maxBlocks == 0). In the latter
1429
+ // case, the device will automatically turn off caching, except
1430
+ // if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
1431
+ // occupancy and launch configuration.
1432
+ //
1433
+ if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
1434
+ // In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
1435
+ // this is what it will be if we spread CTA across partitions.
1436
+ //
1437
+ *gcConfig = PARTITIONED_GC_OFF;
1438
+ numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
1439
+ maxBlocks = numWarpsPerSM / warpsAllocatedPerCTA;
1440
+ }
1441
+ }
1442
+ else {
1443
+ maxBlocks = INT_MAX;
1444
+ }
1445
+ }
1446
+
1447
+
1448
+ result->allocatedRegistersPerBlock = regsAllocatedPerCTA;
1449
+
1450
+ *limit = maxBlocks;
1451
+
1452
+ return status;
1453
+ }
1454
+
1455
+ // Barrier limit
1456
+ //
1457
+ static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
1458
+ int *limit,
1459
+ int ctaLimitBlocks,
1460
+ const cudaOccFuncAttributes *attributes)
1461
+ {
1462
+ cudaOccError status = CUDA_OCC_SUCCESS;
1463
+ int numBarriersAvailable = ctaLimitBlocks * 2;
1464
+ int numBarriersUsed = attributes->numBlockBarriers;
1465
+ int maxBlocks = INT_MAX;
1466
+
1467
+ if (numBarriersUsed) {
1468
+ maxBlocks = numBarriersAvailable / numBarriersUsed;
1469
+ }
1470
+
1471
+ *limit = maxBlocks;
1472
+
1473
+ return status;
1474
+ }
1475
+
1476
+ ///////////////////////////////////
1477
+ // API Implementations //
1478
+ ///////////////////////////////////
1479
+
1480
+ static __OCC_INLINE
1481
+ cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
1482
+ cudaOccResult *result,
1483
+ const cudaOccDeviceProp *properties,
1484
+ const cudaOccFuncAttributes *attributes,
1485
+ const cudaOccDeviceState *state,
1486
+ int blockSize,
1487
+ size_t dynamicSmemSize)
1488
+ {
1489
+ cudaOccError status = CUDA_OCC_SUCCESS;
1490
+ int ctaLimitWarps = 0;
1491
+ int ctaLimitBlocks = 0;
1492
+ int ctaLimitSMem = 0;
1493
+ int ctaLimitRegs = 0;
1494
+ int ctaLimitBars = 0;
1495
+ int ctaLimit = 0;
1496
+ unsigned int limitingFactors = 0;
1497
+
1498
+ cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;
1499
+
1500
+ if (!result || !properties || !attributes || !state || blockSize <= 0) {
1501
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1502
+ }
1503
+
1504
+ ///////////////////////////
1505
+ // Check user input
1506
+ ///////////////////////////
1507
+
1508
+ status = cudaOccInputCheck(properties, attributes, state);
1509
+ if (status != CUDA_OCC_SUCCESS) {
1510
+ return status;
1511
+ }
1512
+
1513
+ ///////////////////////////
1514
+ // Initialization
1515
+ ///////////////////////////
1516
+
1517
+ gcConfig = cudaOccPartitionedGCExpected(properties, attributes);
1518
+
1519
+ ///////////////////////////
1520
+ // Compute occupancy
1521
+ ///////////////////////////
1522
+
1523
+ // Limits due to registers/SM
1524
+ // Also compute if partitioned global caching has to be turned off
1525
+ //
1526
+ status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
1527
+ if (status != CUDA_OCC_SUCCESS) {
1528
+ return status;
1529
+ }
1530
+
1531
+ // SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
1532
+ // As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
1533
+ // For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0),
1534
+ // we do not let it run on any Pascal processor, even though it may be able to run on GP100.
1535
+ // Therefore, we check the occupancy on GP10x when it can run on GP100
1536
+ //
1537
+ if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
1538
+ cudaOccDeviceProp propertiesGP10x;
1539
+ cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
1540
+ int ctaLimitRegsGP10x = 0;
1541
+
1542
+ // Set up properties for GP10x
1543
+ memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
1544
+ propertiesGP10x.computeMinor = 1;
1545
+
1546
+ status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
1547
+ if (status != CUDA_OCC_SUCCESS) {
1548
+ return status;
1549
+ }
1550
+
1551
+ if (ctaLimitRegsGP10x == 0) {
1552
+ ctaLimitRegs = 0;
1553
+ }
1554
+ }
1555
+
1556
+ // Limits due to warps/SM
1557
+ //
1558
+ status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
1559
+ if (status != CUDA_OCC_SUCCESS) {
1560
+ return status;
1561
+ }
1562
+
1563
+ // Limits due to blocks/SM
1564
+ //
1565
+ status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
1566
+ if (status != CUDA_OCC_SUCCESS) {
1567
+ return status;
1568
+ }
1569
+
1570
+ // Limits due to shared memory/SM
1571
+ //
1572
+ status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
1573
+ if (status != CUDA_OCC_SUCCESS) {
1574
+ return status;
1575
+ }
1576
+
1577
+ ///////////////////////////
1578
+ // Overall occupancy
1579
+ ///////////////////////////
1580
+
1581
+ // Overall limit is min() of limits due to above reasons
1582
+ //
1583
+ ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));
1584
+
1585
+ // Determine occupancy limiting factors
1586
+ //
1587
+ if (ctaLimit == ctaLimitWarps) {
1588
+ limitingFactors |= OCC_LIMIT_WARPS;
1589
+ }
1590
+ if (ctaLimit == ctaLimitRegs) {
1591
+ limitingFactors |= OCC_LIMIT_REGISTERS;
1592
+ }
1593
+ if (ctaLimit == ctaLimitSMem) {
1594
+ limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
1595
+ }
1596
+ if (ctaLimit == ctaLimitBlocks) {
1597
+ limitingFactors |= OCC_LIMIT_BLOCKS;
1598
+ }
1599
+
1600
+ // For Hopper onwards compute the limits to occupancy based on block barrier count
1601
+ //
1602
+ if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
1603
+ // Limits due to barrier/SM
1604
+ //
1605
+ status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, attributes);
1606
+ if (status != CUDA_OCC_SUCCESS) {
1607
+ return status;
1608
+ }
1609
+
1610
+ // Recompute overall limit based on barrier/SM
1611
+ //
1612
+ ctaLimit = __occMin(ctaLimitBars, ctaLimit);
1613
+
1614
+ // Determine if this is occupancy limiting factor
1615
+ //
1616
+ if (ctaLimit == ctaLimitBars) {
1617
+ limitingFactors |= OCC_LIMIT_BARRIERS;
1618
+ }
1619
+ }
1620
+ else {
1621
+ ctaLimitBars = INT_MAX;
1622
+ }
1623
+
1624
+ // Fill in the return values
1625
+ //
1626
+ result->limitingFactors = limitingFactors;
1627
+
1628
+ result->blockLimitRegs = ctaLimitRegs;
1629
+ result->blockLimitSharedMem = ctaLimitSMem;
1630
+ result->blockLimitWarps = ctaLimitWarps;
1631
+ result->blockLimitBlocks = ctaLimitBlocks;
1632
+ result->blockLimitBarriers = ctaLimitBars;
1633
+ result->partitionedGCConfig = gcConfig;
1634
+
1635
+ // Final occupancy
1636
+ result->activeBlocksPerMultiprocessor = ctaLimit;
1637
+
1638
+ return CUDA_OCC_SUCCESS;
1639
+ }
1640
+
1641
+ static __OCC_INLINE
1642
+ cudaOccError cudaOccAvailableDynamicSMemPerBlock(
1643
+ size_t *bytesAvailable,
1644
+ const cudaOccDeviceProp *properties,
1645
+ const cudaOccFuncAttributes *attributes,
1646
+ const cudaOccDeviceState *state,
1647
+ int numBlocks,
1648
+ int blockSize)
1649
+ {
1650
+ int allocationGranularity;
1651
+ size_t smemLimitPerBlock;
1652
+ size_t smemAvailableForDynamic;
1653
+ size_t userSmemPreference = 0;
1654
+ size_t sharedMemPerMultiprocessor;
1655
+ cudaOccResult result;
1656
+ cudaOccError status = CUDA_OCC_SUCCESS;
1657
+
1658
+ if (numBlocks <= 0)
1659
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1660
+
1661
+ // First compute occupancy of potential kernel launch.
1662
+ //
1663
+ status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
1664
+ if (status != CUDA_OCC_SUCCESS) {
1665
+ return status;
1666
+ }
1667
+ // Check if occupancy is achievable given user requested number of blocks.
1668
+ //
1669
+ if (result.activeBlocksPerMultiprocessor < numBlocks) {
1670
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1671
+ }
1672
+
1673
+ status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
1674
+ if (status != CUDA_OCC_SUCCESS) {
1675
+ return status;
1676
+ }
1677
+
1678
+ // Return the per block shared memory limit based on function config.
1679
+ //
1680
+ status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
1681
+ if (status != CUDA_OCC_SUCCESS) {
1682
+ return status;
1683
+ }
1684
+
1685
+ // If there is only a single block needed per SM, then the user preference can be ignored and the fully SW
1686
+ // limit is allowed to be used as shared memory otherwise if more than one block is needed, then the user
1687
+ // preference sets the total limit of available shared memory.
1688
+ //
1689
+ cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
1690
+ if (numBlocks == 1) {
1691
+ sharedMemPerMultiprocessor = smemLimitPerBlock;
1692
+ }
1693
+ else {
1694
+ if (!userSmemPreference) {
1695
+ userSmemPreference = 1 ;
1696
+ status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
1697
+ if (status != CUDA_OCC_SUCCESS) {
1698
+ return status;
1699
+ }
1700
+ }
1701
+ sharedMemPerMultiprocessor = userSmemPreference;
1702
+ }
1703
+
1704
+ // Compute total shared memory available per SM
1705
+ //
1706
+ smemAvailableForDynamic = sharedMemPerMultiprocessor / numBlocks;
1707
+ smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;
1708
+
1709
+ // Cap shared memory
1710
+ //
1711
+ if (smemAvailableForDynamic > smemLimitPerBlock) {
1712
+ smemAvailableForDynamic = smemLimitPerBlock;
1713
+ }
1714
+
1715
+ // Now compute dynamic shared memory size
1716
+ smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes;
1717
+
1718
+ // Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute()
1719
+ //
1720
+ if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
1721
+ smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;
1722
+
1723
+ *bytesAvailable = smemAvailableForDynamic;
1724
+ return CUDA_OCC_SUCCESS;
1725
+ }
1726
+
1727
+ static __OCC_INLINE
1728
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
1729
+ int *minGridSize,
1730
+ int *blockSize,
1731
+ const cudaOccDeviceProp *properties,
1732
+ const cudaOccFuncAttributes *attributes,
1733
+ const cudaOccDeviceState *state,
1734
+ size_t (*blockSizeToDynamicSMemSize)(int),
1735
+ size_t dynamicSMemSize)
1736
+ {
1737
+ cudaOccError status = CUDA_OCC_SUCCESS;
1738
+ cudaOccResult result;
1739
+
1740
+ // Limits
1741
+ int occupancyLimit;
1742
+ int granularity;
1743
+ int blockSizeLimit;
1744
+
1745
+ // Recorded maximum
1746
+ int maxBlockSize = 0;
1747
+ int numBlocks = 0;
1748
+ int maxOccupancy = 0;
1749
+
1750
+ // Temporary
1751
+ int blockSizeToTryAligned;
1752
+ int blockSizeToTry;
1753
+ int blockSizeLimitAligned;
1754
+ int occupancyInBlocks;
1755
+ int occupancyInThreads;
1756
+
1757
+ ///////////////////////////
1758
+ // Check user input
1759
+ ///////////////////////////
1760
+
1761
+ if (!minGridSize || !blockSize || !properties || !attributes || !state) {
1762
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1763
+ }
1764
+
1765
+ status = cudaOccInputCheck(properties, attributes, state);
1766
+ if (status != CUDA_OCC_SUCCESS) {
1767
+ return status;
1768
+ }
1769
+
1770
+ /////////////////////////////////////////////////////////////////////////////////
1771
+ // Try each block size, and pick the block size with maximum occupancy
1772
+ /////////////////////////////////////////////////////////////////////////////////
1773
+
1774
+ occupancyLimit = properties->maxThreadsPerMultiprocessor;
1775
+ granularity = properties->warpSize;
1776
+
1777
+ blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
1778
+ blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
1779
+
1780
+ for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
1781
+ blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
1782
+
1783
+ // Ignore dynamicSMemSize if the user provides a mapping
1784
+ //
1785
+ if (blockSizeToDynamicSMemSize) {
1786
+ dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry);
1787
+ }
1788
+
1789
+ status = cudaOccMaxActiveBlocksPerMultiprocessor(
1790
+ &result,
1791
+ properties,
1792
+ attributes,
1793
+ state,
1794
+ blockSizeToTry,
1795
+ dynamicSMemSize);
1796
+
1797
+ if (status != CUDA_OCC_SUCCESS) {
1798
+ return status;
1799
+ }
1800
+
1801
+ occupancyInBlocks = result.activeBlocksPerMultiprocessor;
1802
+ occupancyInThreads = blockSizeToTry * occupancyInBlocks;
1803
+
1804
+ if (occupancyInThreads > maxOccupancy) {
1805
+ maxBlockSize = blockSizeToTry;
1806
+ numBlocks = occupancyInBlocks;
1807
+ maxOccupancy = occupancyInThreads;
1808
+ }
1809
+
1810
+ // Early out if we have reached the maximum
1811
+ //
1812
+ if (occupancyLimit == maxOccupancy) {
1813
+ break;
1814
+ }
1815
+ }
1816
+
1817
+ ///////////////////////////
1818
+ // Return best available
1819
+ ///////////////////////////
1820
+
1821
+ // Suggested min grid size to achieve a full machine launch
1822
+ //
1823
+ *minGridSize = numBlocks * properties->numSms;
1824
+ *blockSize = maxBlockSize;
1825
+
1826
+ return status;
1827
+ }
1828
+
1829
+
1830
+ #if defined(__cplusplus)
1831
+
1832
+ namespace {
1833
+
1834
+ __OCC_INLINE
1835
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
1836
+ int *minGridSize,
1837
+ int *blockSize,
1838
+ const cudaOccDeviceProp *properties,
1839
+ const cudaOccFuncAttributes *attributes,
1840
+ const cudaOccDeviceState *state,
1841
+ size_t dynamicSMemSize)
1842
+ {
1843
+ return cudaOccMaxPotentialOccupancyBlockSize(
1844
+ minGridSize,
1845
+ blockSize,
1846
+ properties,
1847
+ attributes,
1848
+ state,
1849
+ NULL,
1850
+ dynamicSMemSize);
1851
+ }
1852
+
1853
+ template <typename UnaryFunction>
1854
+ __OCC_INLINE
1855
+ cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
1856
+ int *minGridSize,
1857
+ int *blockSize,
1858
+ const cudaOccDeviceProp *properties,
1859
+ const cudaOccFuncAttributes *attributes,
1860
+ const cudaOccDeviceState *state,
1861
+ UnaryFunction blockSizeToDynamicSMemSize)
1862
+ {
1863
+ cudaOccError status = CUDA_OCC_SUCCESS;
1864
+ cudaOccResult result;
1865
+
1866
+ // Limits
1867
+ int occupancyLimit;
1868
+ int granularity;
1869
+ int blockSizeLimit;
1870
+
1871
+ // Recorded maximum
1872
+ int maxBlockSize = 0;
1873
+ int numBlocks = 0;
1874
+ int maxOccupancy = 0;
1875
+
1876
+ // Temporary
1877
+ int blockSizeToTryAligned;
1878
+ int blockSizeToTry;
1879
+ int blockSizeLimitAligned;
1880
+ int occupancyInBlocks;
1881
+ int occupancyInThreads;
1882
+ size_t dynamicSMemSize;
1883
+
1884
+ ///////////////////////////
1885
+ // Check user input
1886
+ ///////////////////////////
1887
+
1888
+ if (!minGridSize || !blockSize || !properties || !attributes || !state) {
1889
+ return CUDA_OCC_ERROR_INVALID_INPUT;
1890
+ }
1891
+
1892
+ status = cudaOccInputCheck(properties, attributes, state);
1893
+ if (status != CUDA_OCC_SUCCESS) {
1894
+ return status;
1895
+ }
1896
+
1897
+ /////////////////////////////////////////////////////////////////////////////////
1898
+ // Try each block size, and pick the block size with maximum occupancy
1899
+ /////////////////////////////////////////////////////////////////////////////////
1900
+
1901
+ occupancyLimit = properties->maxThreadsPerMultiprocessor;
1902
+ granularity = properties->warpSize;
1903
+ blockSizeLimit = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
1904
+ blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
1905
+
1906
+ for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
1907
+ blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
1908
+
1909
+ dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
1910
+
1911
+ status = cudaOccMaxActiveBlocksPerMultiprocessor(
1912
+ &result,
1913
+ properties,
1914
+ attributes,
1915
+ state,
1916
+ blockSizeToTry,
1917
+ dynamicSMemSize);
1918
+
1919
+ if (status != CUDA_OCC_SUCCESS) {
1920
+ return status;
1921
+ }
1922
+
1923
+ occupancyInBlocks = result.activeBlocksPerMultiprocessor;
1924
+
1925
+ occupancyInThreads = blockSizeToTry * occupancyInBlocks;
1926
+
1927
+ if (occupancyInThreads > maxOccupancy) {
1928
+ maxBlockSize = blockSizeToTry;
1929
+ numBlocks = occupancyInBlocks;
1930
+ maxOccupancy = occupancyInThreads;
1931
+ }
1932
+
1933
+ // Early out if we have reached the maximum
1934
+ //
1935
+ if (occupancyLimit == maxOccupancy) {
1936
+ break;
1937
+ }
1938
+ }
1939
+
1940
+ ///////////////////////////
1941
+ // Return best available
1942
+ ///////////////////////////
1943
+
1944
+ // Suggested min grid size to achieve a full machine launch
1945
+ //
1946
+ *minGridSize = numBlocks * properties->numSms;
1947
+ *blockSize = maxBlockSize;
1948
+
1949
+ return status;
1950
+ }
1951
+
1952
+ } // namespace anonymous
1953
+
1954
+ #endif /*__cplusplus */
1955
+
1956
+ #undef __OCC_INLINE
1957
+
1958
+ #endif /*__cuda_occupancy_h__*/
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDA_PIPELINE_H_
51
+ # define _CUDA_PIPELINE_H_
52
+
53
+ # include "cuda_pipeline_primitives.h"
54
+
55
+ # if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
56
+ # error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
57
+ -std=c++11 compiler option.
58
+ # endif
59
+
60
+ # if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
61
+ # include "cuda_awbarrier.h"
62
+ # endif
63
+
64
+ // Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>.
65
+
66
+ # if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
67
+ # if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
68
+ # define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
69
+ # else
70
+ # define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
71
+ # endif
72
+
73
+ # define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
74
+ # define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
75
+ # define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
76
+
77
+ namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
78
+ struct __block_scope_barrier_base;
79
+ }}
80
+
81
+ # endif
82
+
83
+ _CUDA_PIPELINE_BEGIN_NAMESPACE
84
+
85
+ template<size_t N, typename T>
86
+ _CUDA_PIPELINE_QUALIFIER
87
+ auto segment(T* ptr) -> T(*)[N];
88
+
89
+ class pipeline {
90
+ public:
91
+ pipeline(const pipeline&) = delete;
92
+ pipeline(pipeline&&) = delete;
93
+ pipeline& operator=(const pipeline&) = delete;
94
+ pipeline& operator=(pipeline&&) = delete;
95
+
96
+ _CUDA_PIPELINE_QUALIFIER pipeline();
97
+ _CUDA_PIPELINE_QUALIFIER size_t commit();
98
+ _CUDA_PIPELINE_QUALIFIER void commit_and_wait();
99
+ _CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
100
+ template<unsigned N>
101
+ _CUDA_PIPELINE_QUALIFIER void wait_prior();
102
+
103
+ # if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
104
+ _CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
105
+ _CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
106
+ # endif
107
+
108
+ private:
109
+ size_t current_batch;
110
+ };
111
+
112
+ template<class T>
113
+ _CUDA_PIPELINE_QUALIFIER
114
+ void memcpy_async(T& dst, const T& src, pipeline& pipe);
115
+
116
+ template<class T, size_t DstN, size_t SrcN>
117
+ _CUDA_PIPELINE_QUALIFIER
118
+ void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);
119
+
120
+ template<size_t N, typename T>
121
+ _CUDA_PIPELINE_QUALIFIER
122
+ auto segment(T* ptr) -> T(*)[N]
123
+ {
124
+ return (T(*)[N])ptr;
125
+ }
126
+
127
+ _CUDA_PIPELINE_QUALIFIER
128
+ pipeline::pipeline()
129
+ : current_batch(0)
130
+ {
131
+ }
132
+
133
+ _CUDA_PIPELINE_QUALIFIER
134
+ size_t pipeline::commit()
135
+ {
136
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
137
+ return this->current_batch++;
138
+ }
139
+
140
+ _CUDA_PIPELINE_QUALIFIER
141
+ void pipeline::commit_and_wait()
142
+ {
143
+ (void)pipeline::commit();
144
+ pipeline::wait_prior<0>();
145
+ }
146
+
147
+ _CUDA_PIPELINE_QUALIFIER
148
+ void pipeline::wait(size_t batch)
149
+ {
150
+ const size_t prior = this->current_batch > batch ? this->current_batch - batch : 0;
151
+
152
+ switch (prior) {
153
+ case 0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break;
154
+ case 1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break;
155
+ case 2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break;
156
+ case 3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break;
157
+ case 4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break;
158
+ case 5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break;
159
+ case 6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break;
160
+ case 7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break;
161
+ default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break;
162
+ }
163
+ }
164
+
165
+ template<unsigned N>
166
+ _CUDA_PIPELINE_QUALIFIER
167
+ void pipeline::wait_prior()
168
+ {
169
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
170
+ }
171
+
172
+ # if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
173
+ _CUDA_PIPELINE_QUALIFIER
174
+ void pipeline::arrive_on(awbarrier& barrier)
175
+ {
176
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier);
177
+ }
178
+
179
+ _CUDA_PIPELINE_QUALIFIER
180
+ void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier)
181
+ {
182
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t *>(&barrier));
183
+ }
184
+ # endif
185
+
186
+ template<class T>
187
+ _CUDA_PIPELINE_QUALIFIER
188
+ void memcpy_async(T& dst, const T& src, pipeline& pipe)
189
+ {
190
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
191
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));
192
+
193
+ if (__is_trivially_copyable(T)) {
194
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(
195
+ reinterpret_cast<void*>(&dst), reinterpret_cast<const void*>(&src));
196
+ } else {
197
+ dst = src;
198
+ }
199
+ }
200
+
201
+ template<class T, size_t DstN, size_t SrcN>
202
+ _CUDA_PIPELINE_QUALIFIER
203
+ void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
204
+ {
205
+ constexpr size_t dst_size = sizeof(*dst);
206
+ constexpr size_t src_size = sizeof(*src);
207
+ static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
208
+ static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
209
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
210
+ _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));
211
+
212
+ if (__is_trivially_copyable(T)) {
213
+ _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<sizeof(*dst), sizeof(*src)>(
214
+ reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src));
215
+ } else {
216
+ for (size_t i = 0; i < DstN; ++i) {
217
+ (*dst)[i] = (i < SrcN) ? (*src)[i] : T();
218
+ }
219
+ }
220
+ }
221
+
222
+ _CUDA_PIPELINE_END_NAMESPACE
223
+
224
+ #endif /* !_CUDA_PIPELINE_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_TEXTURE_TYPES_H__)
51
+ #define __CUDA_TEXTURE_TYPES_H__
52
+
53
+ #if defined(__cplusplus) && defined(__CUDACC__)
54
+
55
+ /*******************************************************************************
56
+ * *
57
+ * *
58
+ * *
59
+ *******************************************************************************/
60
+
61
+ #if !defined(__CUDACC_RTC__)
62
+ #define EXCLUDE_FROM_RTC
63
+ #include "channel_descriptor.h"
64
+ #undef EXCLUDE_FROM_RTC
65
+ #endif /* !__CUDACC_RTC__ */
66
+ #include "cuda_runtime_api.h"
67
+
68
+ /*******************************************************************************
69
+ * *
70
+ * *
71
+ * *
72
+ *******************************************************************************/
73
+
74
+ template<class T, int texType = cudaTextureType1D, enum cudaTextureReadMode mode = cudaReadModeElementType>
75
+ struct __device_builtin_texture_type__ texture : public textureReference
76
+ {
77
+ #if !defined(__CUDACC_RTC__)
78
+ __host__ texture(int norm = 0,
79
+ enum cudaTextureFilterMode fMode = cudaFilterModePoint,
80
+ enum cudaTextureAddressMode aMode = cudaAddressModeClamp)
81
+ {
82
+ normalized = norm;
83
+ filterMode = fMode;
84
+ addressMode[0] = aMode;
85
+ addressMode[1] = aMode;
86
+ addressMode[2] = aMode;
87
+ channelDesc = cudaCreateChannelDesc<T>();
88
+ sRGB = 0;
89
+ }
90
+
91
+ __host__ texture(int norm,
92
+ enum cudaTextureFilterMode fMode,
93
+ enum cudaTextureAddressMode aMode,
94
+ struct cudaChannelFormatDesc desc)
95
+ {
96
+ normalized = norm;
97
+ filterMode = fMode;
98
+ addressMode[0] = aMode;
99
+ addressMode[1] = aMode;
100
+ addressMode[2] = aMode;
101
+ channelDesc = desc;
102
+ sRGB = 0;
103
+ }
104
+ #endif /* !__CUDACC_RTC__ */
105
+ };
106
+
107
+ #endif /* __cplusplus && __CUDACC__ */
108
+
109
+ #endif /* !__CUDA_TEXTURE_TYPES_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
51
+ #define __DEVICE_ATOMIC_FUNCTIONS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
55
+ #else /* __CUDACC_RTC__ */
56
+ #define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ /*******************************************************************************
62
+ * *
63
+ * *
64
+ * *
65
+ *******************************************************************************/
66
+
67
+ #include "cuda_runtime_api.h"
68
+
69
+ /*******************************************************************************
70
+ * *
71
+ * *
72
+ * *
73
+ *******************************************************************************/
74
+
75
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
76
+ {
77
+ return __iAtomicAdd(address, val);
78
+ }
79
+
80
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
81
+ {
82
+ return __uAtomicAdd(address, val);
83
+ }
84
+
85
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
86
+ {
87
+ return __iAtomicAdd(address, (unsigned int)-(int)val);
88
+ }
89
+
90
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
91
+ {
92
+ return __uAtomicAdd(address, (unsigned int)-(int)val);
93
+ }
94
+
95
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
96
+ {
97
+ return __iAtomicExch(address, val);
98
+ }
99
+
100
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
101
+ {
102
+ return __uAtomicExch(address, val);
103
+ }
104
+
105
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
106
+ {
107
+ return __fAtomicExch(address, val);
108
+ }
109
+
110
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
111
+ {
112
+ return __iAtomicMin(address, val);
113
+ }
114
+
115
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
116
+ {
117
+ return __uAtomicMin(address, val);
118
+ }
119
+
120
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
121
+ {
122
+ return __iAtomicMax(address, val);
123
+ }
124
+
125
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
126
+ {
127
+ return __uAtomicMax(address, val);
128
+ }
129
+
130
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
131
+ {
132
+ return __uAtomicInc(address, val);
133
+ }
134
+
135
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
136
+ {
137
+ return __uAtomicDec(address, val);
138
+ }
139
+
140
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
141
+ {
142
+ return __iAtomicAnd(address, val);
143
+ }
144
+
145
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
146
+ {
147
+ return __uAtomicAnd(address, val);
148
+ }
149
+
150
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
151
+ {
152
+ return __iAtomicOr(address, val);
153
+ }
154
+
155
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
156
+ {
157
+ return __uAtomicOr(address, val);
158
+ }
159
+
160
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
161
+ {
162
+ return __iAtomicXor(address, val);
163
+ }
164
+
165
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
166
+ {
167
+ return __uAtomicXor(address, val);
168
+ }
169
+
170
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
171
+ {
172
+ return __iAtomicCAS(address, compare, val);
173
+ }
174
+
175
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
176
+ {
177
+ return __uAtomicCAS(address, compare, val);
178
+ }
179
+
180
+ /*******************************************************************************
181
+ * *
182
+ * *
183
+ * *
184
+ *******************************************************************************/
185
+
186
+ #include "cuda_runtime_api.h"
187
+
188
+ /*******************************************************************************
189
+ * *
190
+ * *
191
+ * *
192
+ *******************************************************************************/
193
+
194
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
195
+ {
196
+ return __ullAtomicAdd(address, val);
197
+ }
198
+
199
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
200
+ {
201
+ return __ullAtomicExch(address, val);
202
+ }
203
+
204
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
205
+ {
206
+ return __ullAtomicCAS(address, compare, val);
207
+ }
208
+
209
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
210
+ {
211
+ return (bool)__any((int)cond);
212
+ }
213
+
214
+ __DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
215
+ {
216
+ return (bool)__all((int)cond);
217
+ }
218
+
219
+ #endif /* __cplusplus && __CUDACC__ */
220
+
221
+ #undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
222
+
223
+ #endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
224
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "device_double_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/device_double_functions.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
65
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__DEVICE_LAUNCH_PARAMETERS_H__)
51
+ #define __DEVICE_LAUNCH_PARAMETERS_H__
52
+
53
+ #include "vector_types.h"
54
+
55
+ #if !defined(__STORAGE__)
56
+
57
+ #if defined(__CUDACC_RTC__)
58
+ #define __STORAGE__ \
59
+ extern const __device__
60
+ #else /* !__CUDACC_RTC__ */
61
+ #define __STORAGE__ \
62
+ extern const
63
+ #endif /* __CUDACC_RTC__ */
64
+
65
+ #endif /* __STORAGE__ */
66
+
67
+ #if defined(__cplusplus)
68
+ extern "C" {
69
+ #endif /* __cplusplus */
70
+
71
+ uint3 __device_builtin__ __STORAGE__ threadIdx;
72
+ uint3 __device_builtin__ __STORAGE__ blockIdx;
73
+ dim3 __device_builtin__ __STORAGE__ blockDim;
74
+ dim3 __device_builtin__ __STORAGE__ gridDim;
75
+ int __device_builtin__ __STORAGE__ warpSize;
76
+
77
+ #undef __STORAGE__
78
+
79
+ #if defined(__cplusplus)
80
+ }
81
+ #endif /* __cplusplus */
82
+
83
+ #if !defined(__cudaGet_threadIdx)
84
+
85
+ #define __cudaGet_threadIdx() \
86
+ threadIdx
87
+
88
+ #endif /* __cudaGet_threadIdx */
89
+
90
+ #if !defined(__cudaGet_blockIdx)
91
+
92
+ #define __cudaGet_blockIdx() \
93
+ blockIdx
94
+
95
+ #endif /* __cudaGet_blockIdx */
96
+
97
+ #if !defined(__cudaGet_blockDim)
98
+
99
+ #define __cudaGet_blockDim() \
100
+ blockDim
101
+
102
+ #endif /* __cudaGet_blockDim */
103
+
104
+ #if !defined(__cudaGet_gridDim)
105
+
106
+ #define __cudaGet_gridDim() \
107
+ gridDim
108
+
109
+ #endif /* __cudaGet_gridDim */
110
+
111
+ #if !defined(__cudaGet_warpSize)
112
+
113
+ #define __cudaGet_warpSize() \
114
+ warpSize
115
+
116
+ #endif /* __cudaGet_warpSize */
117
+
118
+ #endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__DRIVER_FUNCTIONS_H__)
51
+ #define __DRIVER_FUNCTIONS_H__
52
+
53
+ #include "builtin_types.h"
54
+ #include "crt/host_defines.h"
55
+ #include "driver_types.h"
56
+
57
+ /**
58
+ * \addtogroup CUDART_MEMORY
59
+ *
60
+ * @{
61
+ */
62
+
63
+ /**
64
+ * \brief Returns a cudaPitchedPtr based on input parameters
65
+ *
66
+ * Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
67
+ * \p p, \p xsz, and \p ysz.
68
+ *
69
+ * \param d - Pointer to allocated memory
70
+ * \param p - Pitch of allocated memory in bytes
71
+ * \param xsz - Logical width of allocation in elements
72
+ * \param ysz - Logical height of allocation in elements
73
+ *
74
+ * \return
75
+ * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
76
+ *
77
+ * \sa make_cudaExtent, make_cudaPos
78
+ */
79
+ static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz)
80
+ {
81
+ struct cudaPitchedPtr s;
82
+
83
+ s.ptr = d;
84
+ s.pitch = p;
85
+ s.xsize = xsz;
86
+ s.ysize = ysz;
87
+
88
+ return s;
89
+ }
90
+
91
+ /**
92
+ * \brief Returns a cudaPos based on input parameters
93
+ *
94
+ * Returns a ::cudaPos based on the specified input parameters \p x,
95
+ * \p y, and \p z.
96
+ *
97
+ * \param x - X position
98
+ * \param y - Y position
99
+ * \param z - Z position
100
+ *
101
+ * \return
102
+ * ::cudaPos specified by \p x, \p y, and \p z
103
+ *
104
+ * \sa make_cudaExtent, make_cudaPitchedPtr
105
+ */
106
+ static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z)
107
+ {
108
+ struct cudaPos p;
109
+
110
+ p.x = x;
111
+ p.y = y;
112
+ p.z = z;
113
+
114
+ return p;
115
+ }
116
+
117
+ /**
118
+ * \brief Returns a cudaExtent based on input parameters
119
+ *
120
+ * Returns a ::cudaExtent based on the specified input parameters \p w,
121
+ * \p h, and \p d.
122
+ *
123
+ * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
124
+ * \param h - Height in elements
125
+ * \param d - Depth in elements
126
+ *
127
+ * \return
128
+ * ::cudaExtent specified by \p w, \p h, and \p d
129
+ *
130
+ * \sa make_cudaPitchedPtr, make_cudaPos
131
+ */
132
+ static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d)
133
+ {
134
+ struct cudaExtent e;
135
+
136
+ e.width = w;
137
+ e.height = h;
138
+ e.depth = d;
139
+
140
+ return e;
141
+ }
142
+
143
+ /** @} */ /* END CUDART_MEMORY */
144
+
145
+ #endif /* !__DRIVER_FUNCTIONS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "host_config.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/host_config.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
65
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__LIBRARY_TYPES_H__)
51
+ #define __LIBRARY_TYPES_H__
52
+
53
+
54
+
55
+ typedef enum cudaDataType_t
56
+ {
57
+ CUDA_R_16F = 2, /* real as a half */
58
+ CUDA_C_16F = 6, /* complex as a pair of half numbers */
59
+ CUDA_R_16BF = 14, /* real as a nv_bfloat16 */
60
+ CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */
61
+ CUDA_R_32F = 0, /* real as a float */
62
+ CUDA_C_32F = 4, /* complex as a pair of float numbers */
63
+ CUDA_R_64F = 1, /* real as a double */
64
+ CUDA_C_64F = 5, /* complex as a pair of double numbers */
65
+ CUDA_R_4I = 16, /* real as a signed 4-bit int */
66
+ CUDA_C_4I = 17, /* complex as a pair of signed 4-bit int numbers */
67
+ CUDA_R_4U = 18, /* real as a unsigned 4-bit int */
68
+ CUDA_C_4U = 19, /* complex as a pair of unsigned 4-bit int numbers */
69
+ CUDA_R_8I = 3, /* real as a signed 8-bit int */
70
+ CUDA_C_8I = 7, /* complex as a pair of signed 8-bit int numbers */
71
+ CUDA_R_8U = 8, /* real as a unsigned 8-bit int */
72
+ CUDA_C_8U = 9, /* complex as a pair of unsigned 8-bit int numbers */
73
+ CUDA_R_16I = 20, /* real as a signed 16-bit int */
74
+ CUDA_C_16I = 21, /* complex as a pair of signed 16-bit int numbers */
75
+ CUDA_R_16U = 22, /* real as a unsigned 16-bit int */
76
+ CUDA_C_16U = 23, /* complex as a pair of unsigned 16-bit int numbers */
77
+ CUDA_R_32I = 10, /* real as a signed 32-bit int */
78
+ CUDA_C_32I = 11, /* complex as a pair of signed 32-bit int numbers */
79
+ CUDA_R_32U = 12, /* real as a unsigned 32-bit int */
80
+ CUDA_C_32U = 13, /* complex as a pair of unsigned 32-bit int numbers */
81
+ CUDA_R_64I = 24, /* real as a signed 64-bit int */
82
+ CUDA_C_64I = 25, /* complex as a pair of signed 64-bit int numbers */
83
+ CUDA_R_64U = 26, /* real as a unsigned 64-bit int */
84
+ CUDA_C_64U = 27, /* complex as a pair of unsigned 64-bit int numbers */
85
+ CUDA_R_8F_E4M3 = 28, /* real as a nv_fp8_e4m3 */
86
+ CUDA_R_8F_E5M2 = 29, /* real as a nv_fp8_e5m2 */
87
+ } cudaDataType;
88
+
89
+
90
+ typedef enum libraryPropertyType_t
91
+ {
92
+ MAJOR_VERSION,
93
+ MINOR_VERSION,
94
+ PATCH_LEVEL
95
+ } libraryPropertyType;
96
+
97
+
98
+ #ifndef __cplusplus
99
+ typedef enum cudaDataType_t cudaDataType_t;
100
+ typedef enum libraryPropertyType_t libraryPropertyType_t;
101
+ #endif
102
+
103
+ #endif /* !__LIBRARY_TYPES_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2018 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
51
+ #if defined(_MSC_VER)
52
+ #pragma message("math_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead.")
53
+ #else
54
+ #warning "math_functions.h is an internal header file and must not be used directly. This file will be removed in a future CUDA release. Please use cuda_runtime_api.h or cuda_runtime.h instead."
55
+ #endif
56
+ #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
57
+ #define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
58
+ #endif
59
+
60
+ #include "crt/math_functions.h"
61
+
62
+ #if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__)
63
+ #undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
64
+ #undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
65
+ #endif
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.35.235 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_32_ATOMIC_FUNCTIONS_HPP__)
51
+ #define __SM_32_ATOMIC_FUNCTIONS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
55
+ #else /* !__CUDACC_RTC__ */
56
+ #define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ /*******************************************************************************
72
+ * *
73
+ * *
74
+ * *
75
+ *******************************************************************************/
76
+
77
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val)
78
+ {
79
+ return __illAtomicMin(address, val);
80
+ }
81
+
82
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val)
83
+ {
84
+ return __illAtomicMax(address, val);
85
+ }
86
+
87
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val)
88
+ {
89
+ return __llAtomicAnd(address, val);
90
+ }
91
+
92
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val)
93
+ {
94
+ return __llAtomicOr(address, val);
95
+ }
96
+
97
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val)
98
+ {
99
+ return __llAtomicXor(address, val);
100
+ }
101
+
102
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val)
103
+ {
104
+ return __ullAtomicMin(address, val);
105
+ }
106
+
107
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val)
108
+ {
109
+ return __ullAtomicMax(address, val);
110
+ }
111
+
112
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val)
113
+ {
114
+ return __ullAtomicAnd(address, val);
115
+ }
116
+
117
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val)
118
+ {
119
+ return __ullAtomicOr(address, val);
120
+ }
121
+
122
+ __SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val)
123
+ {
124
+ return __ullAtomicXor(address, val);
125
+ }
126
+
127
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
128
+
129
+ #endif /* __cplusplus && __CUDACC__ */
130
+
131
+ #undef __SM_32_ATOMIC_FUNCTIONS_DECL__
132
+
133
+ #endif /* !__SM_32_ATOMIC_FUNCTIONS_HPP__ */
134
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp ADDED
@@ -0,0 +1,527 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SM_60_ATOMIC_FUNCTIONS_HPP__)
51
+ #define __SM_60_ATOMIC_FUNCTIONS_HPP__
52
+
53
+ #if defined(__CUDACC_RTC__)
54
+ #define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
55
+ #else /* __CUDACC_RTC__ */
56
+ #define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
57
+ #endif /* __CUDACC_RTC__ */
58
+
59
+ #if defined(__cplusplus) && defined(__CUDACC__)
60
+
61
+ #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
62
+
63
+ /*******************************************************************************
64
+ * *
65
+ * *
66
+ * *
67
+ *******************************************************************************/
68
+
69
+ #include "cuda_runtime_api.h"
70
+
71
+ /*******************************************************************************
72
+ * *
73
+ * *
74
+ * *
75
+ *******************************************************************************/
76
+
77
+ __SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val)
78
+ {
79
+ return __dAtomicAdd(address, val);
80
+ }
81
+
82
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
83
+ int atomicAdd_block(int *address, int val)
84
+ {
85
+ return __iAtomicAdd_block(address, val);
86
+ }
87
+
88
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
89
+ int atomicAdd_system(int *address, int val)
90
+ {
91
+ return __iAtomicAdd_system(address, val);
92
+ }
93
+
94
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
95
+ unsigned int atomicAdd_block(unsigned int *address, unsigned int val)
96
+ {
97
+ return __uAtomicAdd_block(address, val);
98
+ }
99
+
100
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
101
+ unsigned int atomicAdd_system(unsigned int *address, unsigned int val)
102
+ {
103
+ return __uAtomicAdd_system(address, val);
104
+ }
105
+
106
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
107
+ unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val)
108
+ {
109
+ return __ullAtomicAdd_block(address, val);
110
+ }
111
+
112
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
113
+ unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val)
114
+ {
115
+ return __ullAtomicAdd_system(address, val);
116
+ }
117
+
118
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
119
+ float atomicAdd_block(float *address, float val)
120
+ {
121
+ return __fAtomicAdd_block(address, val);
122
+ }
123
+
124
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
125
+ float atomicAdd_system(float *address, float val)
126
+ {
127
+ return __fAtomicAdd_system(address, val);
128
+ }
129
+
130
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
131
+ double atomicAdd_block(double *address, double val)
132
+ {
133
+ return __dAtomicAdd_block(address, val);
134
+ }
135
+
136
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
137
+ double atomicAdd_system(double *address, double val)
138
+ {
139
+ return __dAtomicAdd_system(address, val);
140
+ }
141
+
142
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
143
+ int atomicSub_block(int *address, int val)
144
+ {
145
+ return __iAtomicAdd_block(address, (unsigned int)-(int)val);
146
+ }
147
+
148
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
149
+ int atomicSub_system(int *address, int val)
150
+ {
151
+ return __iAtomicAdd_system(address, (unsigned int)-(int)val);
152
+ }
153
+
154
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
155
+ unsigned int atomicSub_block(unsigned int *address, unsigned int val)
156
+ {
157
+ return __uAtomicAdd_block(address, (unsigned int)-(int)val);
158
+ }
159
+
160
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
161
+ unsigned int atomicSub_system(unsigned int *address, unsigned int val)
162
+ {
163
+ return __uAtomicAdd_system(address, (unsigned int)-(int)val);
164
+ }
165
+
166
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
167
+ int atomicExch_block(int *address, int val)
168
+ {
169
+ return __iAtomicExch_block(address, val);
170
+ }
171
+
172
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
173
+ int atomicExch_system(int *address, int val)
174
+ {
175
+ return __iAtomicExch_system(address, val);
176
+ }
177
+
178
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
179
+ unsigned int atomicExch_block(unsigned int *address, unsigned int val)
180
+ {
181
+ return __uAtomicExch_block(address, val);
182
+ }
183
+
184
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
185
+ unsigned int atomicExch_system(unsigned int *address, unsigned int val)
186
+ {
187
+ return __uAtomicExch_system(address, val);
188
+ }
189
+
190
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
191
+ unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val)
192
+ {
193
+ return __ullAtomicExch_block(address, val);
194
+ }
195
+
196
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
197
+ unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val)
198
+ {
199
+ return __ullAtomicExch_system(address, val);
200
+ }
201
+
202
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
203
+ float atomicExch_block(float *address, float val)
204
+ {
205
+ return __fAtomicExch_block(address, val);
206
+ }
207
+
208
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
209
+ float atomicExch_system(float *address, float val)
210
+ {
211
+ return __fAtomicExch_system(address, val);
212
+ }
213
+
214
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
215
+ int atomicMin_block(int *address, int val)
216
+ {
217
+ return __iAtomicMin_block(address, val);
218
+ }
219
+
220
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
221
+ int atomicMin_system(int *address, int val)
222
+ {
223
+ return __iAtomicMin_system(address, val);
224
+ }
225
+
226
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
227
+ long long atomicMin_block(long long *address, long long val)
228
+ {
229
+ return __illAtomicMin_block(address, val);
230
+ }
231
+
232
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
233
+ long long atomicMin_system(long long *address, long long val)
234
+ {
235
+ return __illAtomicMin_system(address, val);
236
+ }
237
+
238
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
239
+ unsigned int atomicMin_block(unsigned int *address, unsigned int val)
240
+ {
241
+ return __uAtomicMin_block(address, val);
242
+ }
243
+
244
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
245
+ unsigned int atomicMin_system(unsigned int *address, unsigned int val)
246
+ {
247
+ return __uAtomicMin_system(address, val);
248
+ }
249
+
250
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
251
+ unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val)
252
+ {
253
+ return __ullAtomicMin_block(address, val);
254
+ }
255
+
256
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
257
+ unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val)
258
+ {
259
+ return __ullAtomicMin_system(address, val);
260
+ }
261
+
262
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
263
+ int atomicMax_block(int *address, int val)
264
+ {
265
+ return __iAtomicMax_block(address, val);
266
+ }
267
+
268
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
269
+ int atomicMax_system(int *address, int val)
270
+ {
271
+ return __iAtomicMax_system(address, val);
272
+ }
273
+
274
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
275
+ long long atomicMax_block(long long *address, long long val)
276
+ {
277
+ return __illAtomicMax_block(address, val);
278
+ }
279
+
280
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
281
+ long long atomicMax_system(long long *address, long long val)
282
+ {
283
+ return __illAtomicMax_system(address, val);
284
+ }
285
+
286
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
287
+ unsigned int atomicMax_block(unsigned int *address, unsigned int val)
288
+ {
289
+ return __uAtomicMax_block(address, val);
290
+ }
291
+
292
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
293
+ unsigned int atomicMax_system(unsigned int *address, unsigned int val)
294
+ {
295
+ return __uAtomicMax_system(address, val);
296
+ }
297
+
298
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
299
+ unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val)
300
+ {
301
+ return __ullAtomicMax_block(address, val);
302
+ }
303
+
304
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
305
+ unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val)
306
+ {
307
+ return __ullAtomicMax_system(address, val);
308
+ }
309
+
310
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
311
+ unsigned int atomicInc_block(unsigned int *address, unsigned int val)
312
+ {
313
+ return __uAtomicInc_block(address, val);
314
+ }
315
+
316
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
317
+ unsigned int atomicInc_system(unsigned int *address, unsigned int val)
318
+ {
319
+ return __uAtomicInc_system(address, val);
320
+ }
321
+
322
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
323
+ unsigned int atomicDec_block(unsigned int *address, unsigned int val)
324
+ {
325
+ return __uAtomicDec_block(address, val);
326
+ }
327
+
328
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
329
+ unsigned int atomicDec_system(unsigned int *address, unsigned int val)
330
+ {
331
+ return __uAtomicDec_system(address, val);
332
+ }
333
+
334
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
335
+ int atomicCAS_block(int *address, int compare, int val)
336
+ {
337
+ return __iAtomicCAS_block(address, compare, val);
338
+ }
339
+
340
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
341
+ int atomicCAS_system(int *address, int compare, int val)
342
+ {
343
+ return __iAtomicCAS_system(address, compare, val);
344
+ }
345
+
346
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
347
+ unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
348
+ unsigned int val)
349
+ {
350
+ return __uAtomicCAS_block(address, compare, val);
351
+ }
352
+
353
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
354
+ unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
355
+ unsigned int val)
356
+ {
357
+ return __uAtomicCAS_system(address, compare, val);
358
+ }
359
+
360
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
361
+ unsigned long long int atomicCAS_block(unsigned long long int *address,
362
+ unsigned long long int compare,
363
+ unsigned long long int val)
364
+ {
365
+ return __ullAtomicCAS_block(address, compare, val);
366
+ }
367
+
368
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
369
+ unsigned long long int atomicCAS_system(unsigned long long int *address,
370
+ unsigned long long int compare,
371
+ unsigned long long int val)
372
+ {
373
+ return __ullAtomicCAS_system(address, compare, val);
374
+ }
375
+
376
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
377
+ int atomicAnd_block(int *address, int val)
378
+ {
379
+ return __iAtomicAnd_block(address, val);
380
+ }
381
+
382
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
383
+ int atomicAnd_system(int *address, int val)
384
+ {
385
+ return __iAtomicAnd_system(address, val);
386
+ }
387
+
388
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
389
+ long long atomicAnd_block(long long *address, long long val)
390
+ {
391
+ return __llAtomicAnd_block(address, val);
392
+ }
393
+
394
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
395
+ long long atomicAnd_system(long long *address, long long val)
396
+ {
397
+ return __llAtomicAnd_system(address, val);
398
+ }
399
+
400
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
401
+ unsigned int atomicAnd_block(unsigned int *address, unsigned int val)
402
+ {
403
+ return __uAtomicAnd_block(address, val);
404
+ }
405
+
406
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
407
+ unsigned int atomicAnd_system(unsigned int *address, unsigned int val)
408
+ {
409
+ return __uAtomicAnd_system(address, val);
410
+ }
411
+
412
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
413
+ unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val)
414
+ {
415
+ return __ullAtomicAnd_block(address, val);
416
+ }
417
+
418
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
419
+ unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val)
420
+ {
421
+ return __ullAtomicAnd_system(address, val);
422
+ }
423
+
424
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
425
+ int atomicOr_block(int *address, int val)
426
+ {
427
+ return __iAtomicOr_block(address, val);
428
+ }
429
+
430
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
431
+ int atomicOr_system(int *address, int val)
432
+ {
433
+ return __iAtomicOr_system(address, val);
434
+ }
435
+
436
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
437
+ long long atomicOr_block(long long *address, long long val)
438
+ {
439
+ return __llAtomicOr_block(address, val);
440
+ }
441
+
442
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
443
+ long long atomicOr_system(long long *address, long long val)
444
+ {
445
+ return __llAtomicOr_system(address, val);
446
+ }
447
+
448
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
449
+ unsigned int atomicOr_block(unsigned int *address, unsigned int val)
450
+ {
451
+ return __uAtomicOr_block(address, val);
452
+ }
453
+
454
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
455
+ unsigned int atomicOr_system(unsigned int *address, unsigned int val)
456
+ {
457
+ return __uAtomicOr_system(address, val);
458
+ }
459
+
460
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
461
+ unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val)
462
+ {
463
+ return __ullAtomicOr_block(address, val);
464
+ }
465
+
466
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
467
+ unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val)
468
+ {
469
+ return __ullAtomicOr_system(address, val);
470
+ }
471
+
472
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
473
+ int atomicXor_block(int *address, int val)
474
+ {
475
+ return __iAtomicXor_block(address, val);
476
+ }
477
+
478
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
479
+ int atomicXor_system(int *address, int val)
480
+ {
481
+ return __iAtomicXor_system(address, val);
482
+ }
483
+
484
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
485
+ long long atomicXor_block(long long *address, long long val)
486
+ {
487
+ return __llAtomicXor_block(address, val);
488
+ }
489
+
490
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
491
+ long long atomicXor_system(long long *address, long long val)
492
+ {
493
+ return __llAtomicXor_system(address, val);
494
+ }
495
+
496
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
497
+ unsigned int atomicXor_block(unsigned int *address, unsigned int val)
498
+ {
499
+ return __uAtomicXor_block(address, val);
500
+ }
501
+
502
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
503
+ unsigned int atomicXor_system(unsigned int *address, unsigned int val)
504
+ {
505
+ return __uAtomicXor_system(address, val);
506
+ }
507
+
508
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
509
+ unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val)
510
+ {
511
+ return __ullAtomicXor_block(address, val);
512
+ }
513
+
514
+ __SM_60_ATOMIC_FUNCTIONS_DECL__
515
+ unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val)
516
+ {
517
+ return __ullAtomicXor_system(address, val);
518
+ }
519
+
520
+ #endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
521
+
522
+ #endif /* __cplusplus && __CUDACC__ */
523
+
524
+ #undef __SM_60_ATOMIC_FUNCTIONS_DECL__
525
+
526
+ #endif /* !__SM_60_ATOMIC_FUNCTIONS_HPP__ */
527
+
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__SURFACE_FUNCTIONS_H__)
51
+ #define __SURFACE_FUNCTIONS_H__
52
+
53
+ #if defined(__cplusplus) && defined(__CUDACC__)
54
+
55
+ /*******************************************************************************
56
+ * *
57
+ * *
58
+ * *
59
+ *******************************************************************************/
60
+
61
+ #include "cuda_runtime_api.h"
62
+ #include "cuda_surface_types.h"
63
+
64
+ #if defined(_WIN32)
65
+ # define __DEPRECATED__ __declspec(deprecated)
66
+ #else
67
+ # define __DEPRECATED__ __attribute__((deprecated))
68
+ #endif
69
+
70
+
71
+
72
+ #ifdef __CUDA_ARCH__
73
+ template <typename T> struct __nv_surf_trait { typedef void * cast_type; };
74
+
75
+ template<> struct __nv_surf_trait<char> { typedef char * cast_type; };
76
+ template<> struct __nv_surf_trait<signed char> { typedef signed char * cast_type; };
77
+ template<> struct __nv_surf_trait<unsigned char> { typedef unsigned char * cast_type; };
78
+ template<> struct __nv_surf_trait<char1> { typedef char1 * cast_type; };
79
+ template<> struct __nv_surf_trait<uchar1> { typedef uchar1 * cast_type; };
80
+ template<> struct __nv_surf_trait<char2> { typedef char2 * cast_type; };
81
+ template<> struct __nv_surf_trait<uchar2> { typedef uchar2 * cast_type; };
82
+ template<> struct __nv_surf_trait<char4> { typedef char4 * cast_type; };
83
+ template<> struct __nv_surf_trait<uchar4> { typedef uchar4 * cast_type; };
84
+ template<> struct __nv_surf_trait<short> { typedef short * cast_type; };
85
+ template<> struct __nv_surf_trait<unsigned short> { typedef unsigned short * cast_type; };
86
+ template<> struct __nv_surf_trait<short1> { typedef short1 * cast_type; };
87
+ template<> struct __nv_surf_trait<ushort1> { typedef ushort1 * cast_type; };
88
+ template<> struct __nv_surf_trait<short2> { typedef short2 * cast_type; };
89
+ template<> struct __nv_surf_trait<ushort2> { typedef ushort2 * cast_type; };
90
+ template<> struct __nv_surf_trait<short4> { typedef short4 * cast_type; };
91
+ template<> struct __nv_surf_trait<ushort4> { typedef ushort4 * cast_type; };
92
+ template<> struct __nv_surf_trait<int> { typedef int * cast_type; };
93
+ template<> struct __nv_surf_trait<unsigned int> { typedef unsigned int * cast_type; };
94
+ template<> struct __nv_surf_trait<int1> { typedef int1 * cast_type; };
95
+ template<> struct __nv_surf_trait<uint1> { typedef uint1 * cast_type; };
96
+ template<> struct __nv_surf_trait<int2> { typedef int2 * cast_type; };
97
+ template<> struct __nv_surf_trait<uint2> { typedef uint2 * cast_type; };
98
+ template<> struct __nv_surf_trait<int4> { typedef int4 * cast_type; };
99
+ template<> struct __nv_surf_trait<uint4> { typedef uint4 * cast_type; };
100
+ template<> struct __nv_surf_trait<long long> { typedef long long * cast_type; };
101
+ template<> struct __nv_surf_trait<unsigned long long> { typedef unsigned long long * cast_type; };
102
+ template<> struct __nv_surf_trait<longlong1> { typedef longlong1 * cast_type; };
103
+ template<> struct __nv_surf_trait<ulonglong1> { typedef ulonglong1 * cast_type; };
104
+ template<> struct __nv_surf_trait<longlong2> { typedef longlong2 * cast_type; };
105
+ template<> struct __nv_surf_trait<ulonglong2> { typedef ulonglong2 * cast_type; };
106
+ #if !defined(__LP64__)
107
+ template<> struct __nv_surf_trait<long> { typedef int * cast_type; };
108
+ template<> struct __nv_surf_trait<unsigned long> { typedef unsigned int * cast_type; };
109
+ template<> struct __nv_surf_trait<long1> { typedef int1 * cast_type; };
110
+ template<> struct __nv_surf_trait<ulong1> { typedef uint1 * cast_type; };
111
+ template<> struct __nv_surf_trait<long2> { typedef int2 * cast_type; };
112
+ template<> struct __nv_surf_trait<ulong2> { typedef uint2 * cast_type; };
113
+ template<> struct __nv_surf_trait<long4> { typedef uint4 * cast_type; };
114
+ template<> struct __nv_surf_trait<ulong4> { typedef int4 * cast_type; };
115
+ #endif
116
+ template<> struct __nv_surf_trait<float> { typedef float * cast_type; };
117
+ template<> struct __nv_surf_trait<float1> { typedef float1 * cast_type; };
118
+ template<> struct __nv_surf_trait<float2> { typedef float2 * cast_type; };
119
+ template<> struct __nv_surf_trait<float4> { typedef float4 * cast_type; };
120
+ #endif /* defined(__CUDA_ARCH__) */
121
+
122
+ template <typename T>
123
+ static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
124
+ {
125
+ #ifdef __CUDA_ARCH__
126
+ __nv_tex_surf_handler("__surf1Dread_v2", (void *)res, s, surf, x, mode);
127
+ #endif
128
+ }
129
+
130
+ template<class T>
131
+ static __DEPRECATED__ __device__ __forceinline__ T surf1Dread(surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
132
+ {
133
+ #ifdef __CUDA_ARCH__
134
+ T temp;
135
+ __nv_tex_surf_handler("__surf1Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, mode);
136
+ return temp;
137
+ #endif
138
+ }
139
+
140
+ template<class T>
141
+ static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
142
+ {
143
+ #ifdef __CUDA_ARCH__
144
+ *res = surf1Dread<T>(surf, x, mode);
145
+ #endif /* __CUDA_ARCH__ */
146
+ }
147
+
148
+
149
+ template <typename T>
150
+ static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
151
+ {
152
+ #ifdef __CUDA_ARCH__
153
+ __nv_tex_surf_handler("__surf2Dread_v2", (void *)res, s, surf, x, y, mode);
154
+ #endif
155
+ }
156
+
157
+ template<class T>
158
+ static __DEPRECATED__ __device__ __forceinline__ T surf2Dread(surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
159
+ {
160
+ #ifdef __CUDA_ARCH__
161
+ T temp;
162
+ __nv_tex_surf_handler("__surf2Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, mode);
163
+ return temp;
164
+ #endif
165
+ }
166
+
167
+ template<class T>
168
+ static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
169
+ {
170
+ #ifdef __CUDA_ARCH__
171
+ *res = surf2Dread<T>(surf, x, y, mode);
172
+ #endif /* __CUDA_ARCH__ */
173
+ }
174
+
175
+
176
+ template <typename T>
177
+ static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
178
+ {
179
+ #ifdef __CUDA_ARCH__
180
+ __nv_tex_surf_handler("__surf3Dread_v2", (void *)res, s, surf, x, y, z, mode);
181
+ #endif
182
+ }
183
+
184
+ template<class T>
185
+ static __DEPRECATED__ __device__ __forceinline__ T surf3Dread(surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
186
+ {
187
+ #ifdef __CUDA_ARCH__
188
+ T temp;
189
+ __nv_tex_surf_handler("__surf3Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, z, mode);
190
+ return temp;
191
+ #endif
192
+ }
193
+
194
+ template<class T>
195
+ static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
196
+ {
197
+ #ifdef __CUDA_ARCH__
198
+ *res = surf3Dread<T>(surf, x, y, z, mode);
199
+ #endif /* __CUDA_ARCH__ */
200
+ }
201
+
202
+
203
+
204
+ template <typename T>
205
+ static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
206
+ {
207
+ #ifdef __CUDA_ARCH__
208
+ __nv_tex_surf_handler("__surf1DLayeredread_v2", (void *)res, s, surf, x, layer, mode);
209
+ #endif
210
+ }
211
+
212
+ template<class T>
213
+ static __DEPRECATED__ __device__ __forceinline__ T surf1DLayeredread(surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
214
+ {
215
+ #ifdef __CUDA_ARCH__
216
+ T temp;
217
+ __nv_tex_surf_handler("__surf1DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, layer, mode);
218
+ return temp;
219
+ #endif
220
+ }
221
+
222
+
223
+ template<class T>
224
+ static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
225
+ {
226
+ #ifdef __CUDA_ARCH__
227
+ *res = surf1DLayeredread<T>(surf, x, layer, mode);
228
+ #endif /* __CUDA_ARCH__ */
229
+ }
230
+
231
+
232
+ template <typename T>
233
+ static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
234
+ {
235
+ #ifdef __CUDA_ARCH__
236
+ __nv_tex_surf_handler("__surf2DLayeredread_v2", (void *)res, s, surf, x, y, layer, mode);
237
+ #endif
238
+ }
239
+
240
+ template<class T>
241
+ static __DEPRECATED__ __device__ __forceinline__ T surf2DLayeredread(surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
242
+ {
243
+ #ifdef __CUDA_ARCH__
244
+ T temp;
245
+ __nv_tex_surf_handler("__surf2DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layer, mode);
246
+ return temp;
247
+ #endif
248
+ }
249
+
250
+
251
+ template<class T>
252
+ static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
253
+ {
254
+ #ifdef __CUDA_ARCH__
255
+ *res = surf2DLayeredread<T>(surf, x, y, layer, mode);
256
+ #endif /* __CUDA_ARCH__ */
257
+ }
258
+
259
+
260
+ template <typename T>
261
+ static __device__ __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
262
+ {
263
+ #ifdef __CUDA_ARCH__
264
+ __nv_tex_surf_handler("__surfCubemapread_v2", (void *)res, s, surf, x, y, face, mode);
265
+ #endif
266
+ }
267
+
268
+ template<class T>
269
+ static __DEPRECATED__ __device__ __forceinline__ T surfCubemapread(surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
270
+ {
271
+ #ifdef __CUDA_ARCH__
272
+ T temp;
273
+
274
+ __nv_tex_surf_handler("__surfCubemapread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, face, mode);
275
+ return temp;
276
+ #endif
277
+ }
278
+
279
+ template<class T>
280
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
281
+ {
282
+ #ifdef __CUDA_ARCH__
283
+ *res = surfCubemapread<T>(surf, x, y, face, mode);
284
+ #endif /* __CUDA_ARCH__ */
285
+ }
286
+
287
+
288
+ template <typename T>
289
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
290
+ {
291
+ #ifdef __CUDA_ARCH__
292
+ __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (void *)res, s, surf, x, y, layerFace, mode);
293
+ #endif
294
+ }
295
+
296
+ template<class T>
297
+ static __DEPRECATED__ __device__ __forceinline__ T surfCubemapLayeredread(surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
298
+ {
299
+ #ifdef __CUDA_ARCH__
300
+ T temp;
301
+ __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layerFace, mode);
302
+ return temp;
303
+ #endif
304
+ }
305
+
306
+ template<class T>
307
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
308
+ {
309
+ #ifdef __CUDA_ARCH__
310
+ *res = surfCubemapLayeredread<T>(surf, x, y, layerFace, mode);
311
+ #endif /* __CUDA_ARCH__ */
312
+ }
313
+
314
+ //surf1Dwrite
315
+ template<class T>
316
+ static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
317
+ {
318
+ #ifdef __CUDA_ARCH__
319
+ __nv_tex_surf_handler("__surf1Dwrite_v2", (void *)&val, s, surf, x, mode);
320
+ #endif
321
+ }
322
+
323
+ template<class T>
324
+ static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
325
+ {
326
+ #ifdef __CUDA_ARCH__
327
+ __nv_tex_surf_handler("__surf1Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, mode);
328
+ #endif /* __CUDA_ARCH__ */
329
+ }
330
+
331
+
332
+ //surf2Dwrite
333
+ template<class T>
334
+ static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
335
+ {
336
+ #ifdef __CUDA_ARCH__
337
+ __nv_tex_surf_handler("__surf2Dwrite_v2", (void *)&val, s, surf, x, y, mode);
338
+ #endif
339
+ }
340
+
341
+ template<class T>
342
+ static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
343
+ {
344
+ #ifdef __CUDA_ARCH__
345
+ __nv_tex_surf_handler("__surf2Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, mode);
346
+ #endif /* __CUDA_ARCH__ */
347
+ }
348
+
349
+ //surf3Dwrite
350
+ template<class T>
351
+ static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
352
+ {
353
+ #ifdef __CUDA_ARCH__
354
+ __nv_tex_surf_handler("__surf3Dwrite_v2", (void *)&val, s, surf, x, y, z,mode);
355
+ #endif
356
+ }
357
+
358
+ template<class T>
359
+ static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
360
+ {
361
+ #ifdef __CUDA_ARCH__
362
+ __nv_tex_surf_handler("__surf3Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, z, mode);
363
+ #endif /* __CUDA_ARCH__ */
364
+ }
365
+
366
+ //surf1DLayeredwrite
367
+ template<class T>
368
+ static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
369
+ {
370
+ #ifdef __CUDA_ARCH__
371
+ __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (void *)&val, s, surf, x, layer,mode);
372
+ #endif
373
+ }
374
+
375
+ template<class T>
376
+ static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
377
+ {
378
+ #ifdef __CUDA_ARCH__
379
+ __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, layer, mode);
380
+ #endif /* __CUDA_ARCH__ */
381
+ }
382
+
383
+ //surf2DLayeredwrite
384
+ template<class T>
385
+ static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
386
+ {
387
+ #ifdef __CUDA_ARCH__
388
+ __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (void *)&val, s, surf, x, y, layer,mode);
389
+ #endif
390
+ }
391
+
392
+ template<class T>
393
+ static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
394
+ {
395
+ #ifdef __CUDA_ARCH__
396
+ __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, layer, mode);
397
+ #endif /* __CUDA_ARCH__ */
398
+ }
399
+
400
+ //surfCubemapwrite
401
+ template<class T>
402
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
403
+ {
404
+ #ifdef __CUDA_ARCH__
405
+ __nv_tex_surf_handler("__surfCubemapwrite_v2", (void *)&val, s, surf, x, y, face, mode);
406
+ #endif
407
+ }
408
+
409
+ template<class T>
410
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
411
+ {
412
+ #ifdef __CUDA_ARCH__
413
+ __nv_tex_surf_handler("__surfCubemapwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, face, mode);
414
+ #endif /* __CUDA_ARCH__ */
415
+ }
416
+
417
+
418
+ //surfCubemapLayeredwrite
419
+ template<class T>
420
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
421
+ {
422
+ #ifdef __CUDA_ARCH__
423
+ __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (void *)&val, s, surf, x, y, layerFace, mode);
424
+ #endif
425
+ }
426
+
427
+ template<class T>
428
+ static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
429
+ {
430
+ #ifdef __CUDA_ARCH__
431
+ __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, layerFace, mode);
432
+ #endif /* __CUDA_ARCH__ */
433
+ }
434
+
435
+ #undef __DEPRECATED__
436
+
437
+
438
+ #endif /* __cplusplus && __CUDACC__ */
439
+ #endif /* !__SURFACE_FUNCTIONS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h ADDED
@@ -0,0 +1,739 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2017 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(__TEXTURE_FETCH_FUNCTIONS_H__)
51
+ #define __TEXTURE_FETCH_FUNCTIONS_H__
52
+
53
+
54
+ #if defined(__cplusplus) && defined(__CUDACC__)
55
+
56
+ /*******************************************************************************
57
+ * *
58
+ * *
59
+ * *
60
+ *******************************************************************************/
61
+
62
+ #include "cuda_runtime_api.h"
63
+ #include "cuda_texture_types.h"
64
+
65
+ #if defined(_WIN32)
66
+ # define __DEPRECATED__ __declspec(deprecated)
67
+ #else
68
+ # define __DEPRECATED__ __attribute__((deprecated))
69
+ #endif
70
+
71
+
72
+ template <typename T>
73
+ struct __nv_tex_rmet_ret { };
74
+
75
+ template<> struct __nv_tex_rmet_ret<char> { typedef char type; };
76
+ template<> struct __nv_tex_rmet_ret<signed char> { typedef signed char type; };
77
+ template<> struct __nv_tex_rmet_ret<unsigned char> { typedef unsigned char type; };
78
+ template<> struct __nv_tex_rmet_ret<char1> { typedef char1 type; };
79
+ template<> struct __nv_tex_rmet_ret<uchar1> { typedef uchar1 type; };
80
+ template<> struct __nv_tex_rmet_ret<char2> { typedef char2 type; };
81
+ template<> struct __nv_tex_rmet_ret<uchar2> { typedef uchar2 type; };
82
+ template<> struct __nv_tex_rmet_ret<char4> { typedef char4 type; };
83
+ template<> struct __nv_tex_rmet_ret<uchar4> { typedef uchar4 type; };
84
+
85
+ template<> struct __nv_tex_rmet_ret<short> { typedef short type; };
86
+ template<> struct __nv_tex_rmet_ret<unsigned short> { typedef unsigned short type; };
87
+ template<> struct __nv_tex_rmet_ret<short1> { typedef short1 type; };
88
+ template<> struct __nv_tex_rmet_ret<ushort1> { typedef ushort1 type; };
89
+ template<> struct __nv_tex_rmet_ret<short2> { typedef short2 type; };
90
+ template<> struct __nv_tex_rmet_ret<ushort2> { typedef ushort2 type; };
91
+ template<> struct __nv_tex_rmet_ret<short4> { typedef short4 type; };
92
+ template<> struct __nv_tex_rmet_ret<ushort4> { typedef ushort4 type; };
93
+
94
+ template<> struct __nv_tex_rmet_ret<int> { typedef int type; };
95
+ template<> struct __nv_tex_rmet_ret<unsigned int> { typedef unsigned int type; };
96
+ template<> struct __nv_tex_rmet_ret<int1> { typedef int1 type; };
97
+ template<> struct __nv_tex_rmet_ret<uint1> { typedef uint1 type; };
98
+ template<> struct __nv_tex_rmet_ret<int2> { typedef int2 type; };
99
+ template<> struct __nv_tex_rmet_ret<uint2> { typedef uint2 type; };
100
+ template<> struct __nv_tex_rmet_ret<int4> { typedef int4 type; };
101
+ template<> struct __nv_tex_rmet_ret<uint4> { typedef uint4 type; };
102
+
103
+ #if !defined(__LP64__)
104
+ template<> struct __nv_tex_rmet_ret<long> { typedef long type; };
105
+ template<> struct __nv_tex_rmet_ret<unsigned long> { typedef unsigned long type; };
106
+ template<> struct __nv_tex_rmet_ret<long1> { typedef long1 type; };
107
+ template<> struct __nv_tex_rmet_ret<ulong1> { typedef ulong1 type; };
108
+ template<> struct __nv_tex_rmet_ret<long2> { typedef long2 type; };
109
+ template<> struct __nv_tex_rmet_ret<ulong2> { typedef ulong2 type; };
110
+ template<> struct __nv_tex_rmet_ret<long4> { typedef long4 type; };
111
+ template<> struct __nv_tex_rmet_ret<ulong4> { typedef ulong4 type; };
112
+ #endif /* !__LP64__ */
113
+ template<> struct __nv_tex_rmet_ret<float> { typedef float type; };
114
+ template<> struct __nv_tex_rmet_ret<float1> { typedef float1 type; };
115
+ template<> struct __nv_tex_rmet_ret<float2> { typedef float2 type; };
116
+ template<> struct __nv_tex_rmet_ret<float4> { typedef float4 type; };
117
+
118
+
119
+ template <typename T> struct __nv_tex_rmet_cast { typedef T* type; };
120
+ #if !defined(__LP64__)
121
+ template<> struct __nv_tex_rmet_cast<long> { typedef int *type; };
122
+ template<> struct __nv_tex_rmet_cast<unsigned long> { typedef unsigned int *type; };
123
+ template<> struct __nv_tex_rmet_cast<long1> { typedef int1 *type; };
124
+ template<> struct __nv_tex_rmet_cast<ulong1> { typedef uint1 *type; };
125
+ template<> struct __nv_tex_rmet_cast<long2> { typedef int2 *type; };
126
+ template<> struct __nv_tex_rmet_cast<ulong2> { typedef uint2 *type; };
127
+ template<> struct __nv_tex_rmet_cast<long4> { typedef int4 *type; };
128
+ template<> struct __nv_tex_rmet_cast<ulong4> { typedef uint4 *type; };
129
+ #endif /* !__LP64__ */
130
+
131
+ template <typename T>
132
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeElementType> t, int x)
133
+ {
134
+ #ifdef __CUDA_ARCH__
135
+ typename __nv_tex_rmet_ret<T>::type temp;
136
+ __nv_tex_surf_handler("__tex1Dfetch_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x);
137
+ return temp;
138
+ #endif
139
+ }
140
+
141
+ template <typename T>
142
+ struct __nv_tex_rmnf_ret { };
143
+
144
+ template <> struct __nv_tex_rmnf_ret<char> { typedef float type; };
145
+ template <> struct __nv_tex_rmnf_ret<signed char> { typedef float type; };
146
+ template <> struct __nv_tex_rmnf_ret<unsigned char> { typedef float type; };
147
+ template <> struct __nv_tex_rmnf_ret<short> { typedef float type; };
148
+ template <> struct __nv_tex_rmnf_ret<unsigned short> { typedef float type; };
149
+ template <> struct __nv_tex_rmnf_ret<char1> { typedef float1 type; };
150
+ template <> struct __nv_tex_rmnf_ret<uchar1> { typedef float1 type; };
151
+ template <> struct __nv_tex_rmnf_ret<short1> { typedef float1 type; };
152
+ template <> struct __nv_tex_rmnf_ret<ushort1> { typedef float1 type; };
153
+ template <> struct __nv_tex_rmnf_ret<char2> { typedef float2 type; };
154
+ template <> struct __nv_tex_rmnf_ret<uchar2> { typedef float2 type; };
155
+ template <> struct __nv_tex_rmnf_ret<short2> { typedef float2 type; };
156
+ template <> struct __nv_tex_rmnf_ret<ushort2> { typedef float2 type; };
157
+ template <> struct __nv_tex_rmnf_ret<char4> { typedef float4 type; };
158
+ template <> struct __nv_tex_rmnf_ret<uchar4> { typedef float4 type; };
159
+ template <> struct __nv_tex_rmnf_ret<short4> { typedef float4 type; };
160
+ template <> struct __nv_tex_rmnf_ret<ushort4> { typedef float4 type; };
161
+
162
+ template <typename T>
163
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, int x)
164
+ {
165
+ #ifdef __CUDA_ARCH__
166
+ T type_dummy;
167
+ typename __nv_tex_rmnf_ret<T>::type retval;
168
+ __nv_tex_surf_handler("__tex1Dfetch_rmnf_v2", &type_dummy, &retval, t, x);
169
+ return retval;
170
+ #endif /* __CUDA_ARCH__ */
171
+ }
172
+
173
+ // tex1D
174
+ template <typename T>
175
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x)
176
+ {
177
+ #ifdef __CUDA_ARCH__
178
+ typename __nv_tex_rmet_ret<T>::type temp;
179
+ __nv_tex_surf_handler("__tex1D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x);
180
+ return temp;
181
+ #endif
182
+ }
183
+
184
+ template <typename T>
185
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x)
186
+ {
187
+ #ifdef __CUDA_ARCH__
188
+ T type_dummy;
189
+ typename __nv_tex_rmnf_ret<T>::type retval;
190
+ __nv_tex_surf_handler("__tex1D_rmnf_v2", &type_dummy, &retval, t, x);
191
+ return retval;
192
+ #endif /* __CUDA_ARCH__ */
193
+ }
194
+
195
+
196
+ //tex2D
197
+ template <typename T>
198
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y)
199
+ {
200
+ #ifdef __CUDA_ARCH__
201
+ typename __nv_tex_rmet_ret<T>::type temp;
202
+
203
+ __nv_tex_surf_handler("__tex2D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y);
204
+ return temp;
205
+ #endif
206
+ }
207
+
208
+ template <typename T>
209
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y)
210
+ {
211
+ #ifdef __CUDA_ARCH__
212
+ T type_dummy;
213
+ typename __nv_tex_rmnf_ret<T>::type retval;
214
+ __nv_tex_surf_handler("__tex2D_rmnf_v2", &type_dummy, &retval, t, x, y);
215
+ return retval;
216
+ #endif /* __CUDA_ARCH__ */
217
+ }
218
+
219
+
220
+ //tex1DLayered
221
+ template <typename T>
222
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer)
223
+ {
224
+ #ifdef __CUDA_ARCH__
225
+ typename __nv_tex_rmet_ret<T>::type temp;
226
+ __nv_tex_surf_handler("__tex1DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, layer);
227
+ return temp;
228
+ #endif
229
+ }
230
+
231
+ template <typename T>
232
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer)
233
+ {
234
+ #ifdef __CUDA_ARCH__
235
+ T type_dummy;
236
+ typename __nv_tex_rmnf_ret<T>::type retval;
237
+ __nv_tex_surf_handler("__tex1DLayered_rmnf_v2", &type_dummy, &retval, t, x, layer);
238
+ return retval;
239
+ #endif /* __CUDA_ARCH__ */
240
+ }
241
+
242
+
243
+ //tex2DLayered
244
+ template <typename T>
245
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer)
246
+ {
247
+ #ifdef __CUDA_ARCH__
248
+ typename __nv_tex_rmet_ret<T>::type temp;
249
+ __nv_tex_surf_handler("__tex2DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, layer);
250
+ return temp;
251
+ #endif
252
+ }
253
+
254
+ template <typename T>
255
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer)
256
+ {
257
+ #ifdef __CUDA_ARCH__
258
+ T type_dummy;
259
+ typename __nv_tex_rmnf_ret<T>::type retval;
260
+ __nv_tex_surf_handler("__tex2DLayered_rmnf_v2", &type_dummy, &retval, t, x, y, layer);
261
+ return retval;
262
+ #endif /* __CUDA_ARCH__ */
263
+ }
264
+
265
+ // tex3D
266
+ template <typename T>
267
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z)
268
+ {
269
+ #ifdef __CUDA_ARCH__
270
+ typename __nv_tex_rmet_ret<T>::type temp;
271
+ __nv_tex_surf_handler("__tex3D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z);
272
+ return temp;
273
+ #endif
274
+ }
275
+
276
+ template <typename T>
277
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z)
278
+ {
279
+ #ifdef __CUDA_ARCH__
280
+ T type_dummy;
281
+ typename __nv_tex_rmnf_ret<T>::type retval;
282
+ __nv_tex_surf_handler("__tex3D_rmnf_v2", &type_dummy, &retval, t, x, y, z);
283
+ return retval;
284
+ #endif /* __CUDA_ARCH__ */
285
+ }
286
+
287
+ // texCubemap
288
+ template <typename T>
289
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z)
290
+ {
291
+ #ifdef __CUDA_ARCH__
292
+ typename __nv_tex_rmet_ret<T>::type temp;
293
+ __nv_tex_surf_handler("__texCubemap_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z);
294
+ return temp;
295
+ #endif
296
+ }
297
+
298
+ template <typename T>
299
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z)
300
+ {
301
+ #ifdef __CUDA_ARCH__
302
+ T type_dummy;
303
+ typename __nv_tex_rmnf_ret<T>::type retval;
304
+ __nv_tex_surf_handler("__texCubemap_rmnf_v2", &type_dummy, &retval, t, x, y, z);
305
+ return retval;
306
+ #endif /* __CUDA_ARCH__ */
307
+ }
308
+
309
+
310
+ template <typename T>
311
+ struct __nv_tex2dgather_ret { };
312
+ template <> struct __nv_tex2dgather_ret<char> { typedef char4 type; };
313
+ template <> struct __nv_tex2dgather_ret<signed char> { typedef char4 type; };
314
+ template <> struct __nv_tex2dgather_ret<char1> { typedef char4 type; };
315
+ template <> struct __nv_tex2dgather_ret<char2> { typedef char4 type; };
316
+ template <> struct __nv_tex2dgather_ret<char3> { typedef char4 type; };
317
+ template <> struct __nv_tex2dgather_ret<char4> { typedef char4 type; };
318
+ template <> struct __nv_tex2dgather_ret<unsigned char> { typedef uchar4 type; };
319
+ template <> struct __nv_tex2dgather_ret<uchar1> { typedef uchar4 type; };
320
+ template <> struct __nv_tex2dgather_ret<uchar2> { typedef uchar4 type; };
321
+ template <> struct __nv_tex2dgather_ret<uchar3> { typedef uchar4 type; };
322
+ template <> struct __nv_tex2dgather_ret<uchar4> { typedef uchar4 type; };
323
+
324
+ template <> struct __nv_tex2dgather_ret<short> { typedef short4 type; };
325
+ template <> struct __nv_tex2dgather_ret<short1> { typedef short4 type; };
326
+ template <> struct __nv_tex2dgather_ret<short2> { typedef short4 type; };
327
+ template <> struct __nv_tex2dgather_ret<short3> { typedef short4 type; };
328
+ template <> struct __nv_tex2dgather_ret<short4> { typedef short4 type; };
329
+ template <> struct __nv_tex2dgather_ret<unsigned short> { typedef ushort4 type; };
330
+ template <> struct __nv_tex2dgather_ret<ushort1> { typedef ushort4 type; };
331
+ template <> struct __nv_tex2dgather_ret<ushort2> { typedef ushort4 type; };
332
+ template <> struct __nv_tex2dgather_ret<ushort3> { typedef ushort4 type; };
333
+ template <> struct __nv_tex2dgather_ret<ushort4> { typedef ushort4 type; };
334
+
335
+ template <> struct __nv_tex2dgather_ret<int> { typedef int4 type; };
336
+ template <> struct __nv_tex2dgather_ret<int1> { typedef int4 type; };
337
+ template <> struct __nv_tex2dgather_ret<int2> { typedef int4 type; };
338
+ template <> struct __nv_tex2dgather_ret<int3> { typedef int4 type; };
339
+ template <> struct __nv_tex2dgather_ret<int4> { typedef int4 type; };
340
+ template <> struct __nv_tex2dgather_ret<unsigned int> { typedef uint4 type; };
341
+ template <> struct __nv_tex2dgather_ret<uint1> { typedef uint4 type; };
342
+ template <> struct __nv_tex2dgather_ret<uint2> { typedef uint4 type; };
343
+ template <> struct __nv_tex2dgather_ret<uint3> { typedef uint4 type; };
344
+ template <> struct __nv_tex2dgather_ret<uint4> { typedef uint4 type; };
345
+
346
+ template <> struct __nv_tex2dgather_ret<float> { typedef float4 type; };
347
+ template <> struct __nv_tex2dgather_ret<float1> { typedef float4 type; };
348
+ template <> struct __nv_tex2dgather_ret<float2> { typedef float4 type; };
349
+ template <> struct __nv_tex2dgather_ret<float3> { typedef float4 type; };
350
+ template <> struct __nv_tex2dgather_ret<float4> { typedef float4 type; };
351
+
352
+ template <typename T>
353
+ static __device__ __forceinline__ typename __nv_tex2dgather_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, int comp=0)
354
+ {
355
+ #ifdef __CUDA_ARCH__
356
+ T type_dummy;
357
+ typename __nv_tex2dgather_ret<T>::type retval;
358
+ __nv_tex_surf_handler("__tex2Dgather_v2", &type_dummy, &retval, t, x, y, comp);
359
+ return retval;
360
+ #endif /* __CUDA_ARCH__ */
361
+ }
362
+
363
+
364
+ template<typename T> struct __nv_tex2dgather_rmnf_ret { };
365
+ template<> struct __nv_tex2dgather_rmnf_ret<char> { typedef float4 type; };
366
+ template<> struct __nv_tex2dgather_rmnf_ret<signed char> { typedef float4 type; };
367
+ template<> struct __nv_tex2dgather_rmnf_ret<unsigned char> { typedef float4 type; };
368
+ template<> struct __nv_tex2dgather_rmnf_ret<char1> { typedef float4 type; };
369
+ template<> struct __nv_tex2dgather_rmnf_ret<uchar1> { typedef float4 type; };
370
+ template<> struct __nv_tex2dgather_rmnf_ret<char2> { typedef float4 type; };
371
+ template<> struct __nv_tex2dgather_rmnf_ret<uchar2> { typedef float4 type; };
372
+ template<> struct __nv_tex2dgather_rmnf_ret<char3> { typedef float4 type; };
373
+ template<> struct __nv_tex2dgather_rmnf_ret<uchar3> { typedef float4 type; };
374
+ template<> struct __nv_tex2dgather_rmnf_ret<char4> { typedef float4 type; };
375
+ template<> struct __nv_tex2dgather_rmnf_ret<uchar4> { typedef float4 type; };
376
+ template<> struct __nv_tex2dgather_rmnf_ret<signed short> { typedef float4 type; };
377
+ template<> struct __nv_tex2dgather_rmnf_ret<unsigned short> { typedef float4 type; };
378
+ template<> struct __nv_tex2dgather_rmnf_ret<short1> { typedef float4 type; };
379
+ template<> struct __nv_tex2dgather_rmnf_ret<ushort1> { typedef float4 type; };
380
+ template<> struct __nv_tex2dgather_rmnf_ret<short2> { typedef float4 type; };
381
+ template<> struct __nv_tex2dgather_rmnf_ret<ushort2> { typedef float4 type; };
382
+ template<> struct __nv_tex2dgather_rmnf_ret<short3> { typedef float4 type; };
383
+ template<> struct __nv_tex2dgather_rmnf_ret<ushort3> { typedef float4 type; };
384
+ template<> struct __nv_tex2dgather_rmnf_ret<short4> { typedef float4 type; };
385
+ template<> struct __nv_tex2dgather_rmnf_ret<ushort4> { typedef float4 type; };
386
+
387
+ template <typename T>
388
+ static __device__ __forceinline__ typename __nv_tex2dgather_rmnf_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, int comp = 0)
389
+ {
390
+ #ifdef __CUDA_ARCH__
391
+ T type_dummy;
392
+ typename __nv_tex2dgather_rmnf_ret<T>::type retval;
393
+ __nv_tex_surf_handler("__tex2Dgather_rmnf_v2", &type_dummy, &retval, t, x, y, comp);
394
+ return retval;
395
+ #endif /* __CUDA_ARCH__ */
396
+ }
397
+
398
+
399
+ // tex1DLod
400
+ template <typename T>
401
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float level)
402
+ {
403
+ #ifdef __CUDA_ARCH__
404
+ typename __nv_tex_rmet_ret<T>::type temp;
405
+ __nv_tex_surf_handler("__tex1DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, level);
406
+ return temp;
407
+ #endif
408
+ }
409
+
410
+ template <typename T>
411
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float level)
412
+ {
413
+ #ifdef __CUDA_ARCH__
414
+ T type_dummy;
415
+ typename __nv_tex_rmnf_ret<T>::type retval;
416
+ __nv_tex_surf_handler("__tex1DLod_rmnf_v2", &type_dummy, &retval, t, x, level);
417
+ return retval;
418
+ #endif /* __CUDA_ARCH__ */
419
+ }
420
+
421
+ // tex2DLod
422
+ template <typename T>
423
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float level)
424
+ {
425
+ #ifdef __CUDA_ARCH__
426
+ typename __nv_tex_rmet_ret<T>::type temp;
427
+ __nv_tex_surf_handler("__tex2DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, level);
428
+ return temp;
429
+ #endif
430
+ }
431
+
432
+ template <typename T>
433
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float level)
434
+ {
435
+ #ifdef __CUDA_ARCH__
436
+ T type_dummy;
437
+ typename __nv_tex_rmnf_ret<T>::type retval;
438
+ __nv_tex_surf_handler("__tex2DLod_rmnf_v2", &type_dummy, &retval, t, x, y, level);
439
+ return retval;
440
+ #endif /* __CUDA_ARCH__ */
441
+ }
442
+
443
+ // tex1DLayeredLod
444
+ template <typename T>
445
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float level)
446
+ {
447
+ #ifdef __CUDA_ARCH__
448
+ typename __nv_tex_rmet_ret<T>::type temp;
449
+ __nv_tex_surf_handler("__tex1DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, level);
450
+ return temp;
451
+ #endif
452
+ }
453
+
454
+ template <typename T>
455
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float level)
456
+ {
457
+ #ifdef __CUDA_ARCH__
458
+ T type_dummy;
459
+ typename __nv_tex_rmnf_ret<T>::type retval;
460
+ __nv_tex_surf_handler("__tex1DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, layer, level);
461
+ return retval;
462
+ #endif /* __CUDA_ARCH__ */
463
+ }
464
+
465
+ // tex2DLayeredLod
466
+ template <typename T>
467
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float level)
468
+ {
469
+ #ifdef __CUDA_ARCH__
470
+ typename __nv_tex_rmet_ret<T>::type temp;
471
+ __nv_tex_surf_handler("__tex2DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, level);
472
+ return temp;
473
+ #endif
474
+ }
475
+
476
+ template <typename T>
477
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float level)
478
+ {
479
+ #ifdef __CUDA_ARCH__
480
+ T type_dummy;
481
+ typename __nv_tex_rmnf_ret<T>::type retval;
482
+ __nv_tex_surf_handler("__tex2DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, layer, level);
483
+ return retval;
484
+ #endif /* __CUDA_ARCH__ */
485
+ }
486
+
487
+ // tex3DLod
488
+ template <typename T>
489
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float level)
490
+ {
491
+ #ifdef __CUDA_ARCH__
492
+ typename __nv_tex_rmet_ret<T>::type temp;
493
+ __nv_tex_surf_handler("__tex3DLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level);
494
+ return temp;
495
+ #endif
496
+ }
497
+
498
+ template <typename T>
499
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level)
500
+ {
501
+ #ifdef __CUDA_ARCH__
502
+ T type_dummy;
503
+ typename __nv_tex_rmnf_ret<T>::type retval;
504
+ __nv_tex_surf_handler("__tex3DLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level);
505
+ return retval;
506
+ #endif /* __CUDA_ARCH__ */
507
+ }
508
+
509
+ // texCubemapLod
510
+ template <typename T>
511
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float level)
512
+ {
513
+ #ifdef __CUDA_ARCH__
514
+ typename __nv_tex_rmet_ret<T>::type temp;
515
+ __nv_tex_surf_handler("__texCubemapLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level);
516
+ return temp;
517
+ #endif
518
+ }
519
+
520
+ template <typename T>
521
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level)
522
+ {
523
+ #ifdef __CUDA_ARCH__
524
+ T type_dummy;
525
+ typename __nv_tex_rmnf_ret<T>::type retval;
526
+ __nv_tex_surf_handler("__texCubemapLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level);
527
+ return retval;
528
+ #endif /* __CUDA_ARCH__ */
529
+ }
530
+
531
+
532
+ // texCubemapLayered
533
+ template <typename T>
534
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer)
535
+ {
536
+ #ifdef __CUDA_ARCH__
537
+ typename __nv_tex_rmet_ret<T>::type temp;
538
+ __nv_tex_surf_handler("__texCubemapLayered_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer);
539
+ return temp;
540
+ #endif
541
+ }
542
+
543
+ template <typename T>
544
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer)
545
+ {
546
+ #ifdef __CUDA_ARCH__
547
+ T type_dummy;
548
+ typename __nv_tex_rmnf_ret<T>::type retval;
549
+ __nv_tex_surf_handler("__texCubemapLayered_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer);
550
+ return retval;
551
+ #endif /* __CUDA_ARCH__ */
552
+ }
553
+
554
+
555
+ // texCubemapLayeredLod
556
+ template <typename T>
557
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float level)
558
+ {
559
+ #ifdef __CUDA_ARCH__
560
+ typename __nv_tex_rmet_ret<T>::type temp;
561
+ __nv_tex_surf_handler("__texCubemapLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, level);
562
+ return temp;
563
+ #endif
564
+ }
565
+
566
+ template <typename T>
567
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float level)
568
+ {
569
+ #ifdef __CUDA_ARCH__
570
+ T type_dummy;
571
+ typename __nv_tex_rmnf_ret<T>::type retval;
572
+ __nv_tex_surf_handler("__texCubemapLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer, level);
573
+ return retval;
574
+ #endif /* __CUDA_ARCH__ */
575
+ }
576
+
577
+
578
+ // texCubemapGrad
579
+ template <typename T>
580
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy)
581
+ {
582
+ #ifdef __CUDA_ARCH__
583
+ typename __nv_tex_rmet_ret<T>::type temp;
584
+ __nv_tex_surf_handler("__texCubemapGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy);
585
+ return temp;
586
+ #endif
587
+ }
588
+
589
+ template <typename T>
590
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy)
591
+ {
592
+ #ifdef __CUDA_ARCH__
593
+ T type_dummy;
594
+ typename __nv_tex_rmnf_ret<T>::type retval;
595
+ __nv_tex_surf_handler("__texCubemapGrad_rmnf_v2", &type_dummy, &retval, t, x, y, z, &dPdx, &dPdy);
596
+ return retval;
597
+ #endif /* __CUDA_ARCH__ */
598
+ }
599
+
600
+
601
+ // texCubemapLayeredGrad
602
+ template <typename T>
603
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
604
+ {
605
+ #ifdef __CUDA_ARCH__
606
+ typename __nv_tex_rmet_ret<T>::type temp;
607
+ __nv_tex_surf_handler("__texCubemapLayeredGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, &dPdx, &dPdy);
608
+ return temp;
609
+ #endif
610
+ }
611
+
612
+ template <typename T>
613
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
614
+ {
615
+ #ifdef __CUDA_ARCH__
616
+ T type_dummy;
617
+ typename __nv_tex_rmnf_ret<T>::type retval;
618
+ __nv_tex_surf_handler("__texCubemapLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, layer, &dPdx, &dPdy);
619
+ return retval;
620
+ #endif /* __CUDA_ARCH__ */
621
+ }
622
+
623
+
624
+ // tex1DGrad
625
+ template <typename T>
626
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float dPdx, float dPdy)
627
+ {
628
+ #ifdef __CUDA_ARCH__
629
+ typename __nv_tex_rmet_ret<T>::type temp;
630
+ __nv_tex_surf_handler("__tex1DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, dPdx, dPdy);
631
+ return temp;
632
+ #endif
633
+ }
634
+
635
+ template <typename T>
636
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float dPdx, float dPdy)
637
+ {
638
+ #ifdef __CUDA_ARCH__
639
+ T type_dummy;
640
+ typename __nv_tex_rmnf_ret<T>::type retval;
641
+ __nv_tex_surf_handler("__tex1DGrad_rmnf_v2", &type_dummy, &retval,t, x,dPdx, dPdy);
642
+ return retval;
643
+ #endif /* __CUDA_ARCH__ */
644
+ }
645
+
646
+
647
+ // tex2DGrad
648
+ template <typename T>
649
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float2 dPdx, float2 dPdy)
650
+ {
651
+ #ifdef __CUDA_ARCH__
652
+ typename __nv_tex_rmet_ret<T>::type temp;
653
+ __nv_tex_surf_handler("__tex2DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, &dPdx, &dPdy);
654
+ return temp;
655
+ #endif
656
+ }
657
+
658
+ template <typename T>
659
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float2 dPdx, float2 dPdy)
660
+ {
661
+ #ifdef __CUDA_ARCH__
662
+ T type_dummy;
663
+ typename __nv_tex_rmnf_ret<T>::type retval;
664
+ __nv_tex_surf_handler("__tex2DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, &dPdx, &dPdy);
665
+ return retval;
666
+ #endif /* __CUDA_ARCH__ */
667
+ }
668
+
669
+ // tex1DLayeredGrad
670
+ template <typename T>
671
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float dPdx, float dPdy)
672
+ {
673
+ #ifdef __CUDA_ARCH__
674
+ typename __nv_tex_rmet_ret<T>::type temp;
675
+ __nv_tex_surf_handler("__tex1DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, dPdx, dPdy);
676
+ return temp;
677
+ #endif
678
+ }
679
+
680
+ template <typename T>
681
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float dPdx, float dPdy)
682
+ {
683
+ #ifdef __CUDA_ARCH__
684
+ T type_dummy;
685
+ typename __nv_tex_rmnf_ret<T>::type retval;
686
+ __nv_tex_surf_handler("__tex1DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, layer, dPdx, dPdy);
687
+ return retval;
688
+ #endif /* __CUDA_ARCH__ */
689
+ }
690
+
691
+ // tex2DLayeredGrad
692
+ template <typename T>
693
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
694
+ {
695
+ #ifdef __CUDA_ARCH__
696
+ typename __nv_tex_rmet_ret<T>::type temp;
697
+ __nv_tex_surf_handler("__tex2DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, &dPdx, &dPdy);
698
+ return temp;
699
+ #endif
700
+ }
701
+
702
+ template <typename T>
703
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
704
+ {
705
+ #ifdef __CUDA_ARCH__
706
+ T type_dummy;
707
+ typename __nv_tex_rmnf_ret<T>::type retval;
708
+ __nv_tex_surf_handler("__tex2DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, layer, &dPdx, &dPdy);
709
+ return retval;
710
+ #endif /* __CUDA_ARCH__ */
711
+ }
712
+
713
+ // tex3DGrad
714
+ template <typename T>
715
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy)
716
+ {
717
+ #ifdef __CUDA_ARCH__
718
+ typename __nv_tex_rmet_ret<T>::type temp;
719
+ __nv_tex_surf_handler("__tex3DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy);
720
+ return temp;
721
+ #endif
722
+ }
723
+
724
+ template <typename T>
725
+ static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy)
726
+ {
727
+ #ifdef __CUDA_ARCH__
728
+ T type_dummy;
729
+ typename __nv_tex_rmnf_ret<T>::type retval;
730
+ __nv_tex_surf_handler("__tex3DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, &dPdx, &dPdy);
731
+ return retval;
732
+ #endif /* __CUDA_ARCH__ */
733
+ }
734
+
735
+ #undef __DEPRECATED__
736
+
737
+ #endif /* __cplusplus && __CUDACC__ */
738
+
739
+ #endif /* !__TEXTURE_FETCH_FUNCTIONS_H__ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (224 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn : Neural Networks Library
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_H_)
55
+ #define CUDNN_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops_infer.h"
62
+ #include "cudnn_ops_train.h"
63
+ #include "cudnn_adv_infer.h"
64
+ #include "cudnn_adv_train.h"
65
+ #include "cudnn_cnn_infer.h"
66
+ #include "cudnn_cnn_train.h"
67
+
68
+ #include "cudnn_backend.h"
69
+
70
+ #if defined(__cplusplus)
71
+ extern "C" {
72
+ #endif
73
+
74
+ #if defined(__cplusplus)
75
+ }
76
+ #endif
77
+
78
+ #endif /* CUDNN_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h ADDED
@@ -0,0 +1,658 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2017-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn_adv_infer : cuDNN's advanced and experimental features.
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_ADV_INFER_H_)
55
+ #define CUDNN_ADV_INFER_H_
56
+
57
+ #include <cuda_runtime.h>
58
+ #include <stdint.h>
59
+
60
+ #include "cudnn_version.h"
61
+ #include "cudnn_ops_infer.h"
62
+
63
+ /* These version numbers are autogenerated, do not edit manually. */
64
+ #define CUDNN_ADV_INFER_MAJOR 8
65
+ #define CUDNN_ADV_INFER_MINOR 7
66
+ #define CUDNN_ADV_INFER_PATCH 0
67
+
68
+ #if (CUDNN_ADV_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_INFER_MINOR != CUDNN_MINOR) || \
69
+ (CUDNN_ADV_INFER_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN ADV INFER!!!
71
+ #endif
72
+
73
+ #if defined(__cplusplus)
74
+ extern "C" {
75
+ #endif
76
+
77
+ /* BASIC RNN API */
78
+
79
+ typedef enum {
80
+ CUDNN_FWD_MODE_INFERENCE = 0,
81
+ CUDNN_FWD_MODE_TRAINING = 1,
82
+ } cudnnForwardMode_t;
83
+
84
+ typedef enum {
85
+ CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */
86
+ CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
87
+ CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */
88
+ CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
89
+ } cudnnRNNMode_t;
90
+
91
+ typedef enum {
92
+ CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */
93
+ CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
94
+ CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */
95
+ CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */
96
+ } cudnnRNNBiasMode_t;
97
+
98
+ typedef enum {
99
+ CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
100
+ CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */
101
+ } cudnnDirectionMode_t;
102
+
103
+ typedef enum {
104
+ CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
105
+ CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */
106
+ } cudnnRNNInputMode_t;
107
+
108
+ typedef enum {
109
+ CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */
110
+ CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
111
+ } cudnnRNNClipMode_t;
112
+
113
+ typedef enum {
114
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */
115
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */
116
+ CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
117
+ } cudnnRNNDataLayout_t;
118
+
119
+ /* Legacy type for backward compatibility */
120
+ typedef unsigned cudnnRNNPaddingMode_t;
121
+
122
+ /* For auxFlags in cudnnSetRNNDescriptor_v8() and cudnnSetRNNPaddingMode() */
123
+ #define CUDNN_RNN_PADDED_IO_DISABLED 0
124
+ #define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
125
+
126
+ struct cudnnRNNStruct;
127
+ typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
128
+
129
+ struct cudnnPersistentRNNPlan;
130
+ typedef struct cudnnPersistentRNNPlan *cudnnPersistentRNNPlan_t;
131
+
132
+ struct cudnnRNNDataStruct;
133
+ typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
134
+
135
+ cudnnStatus_t CUDNNWINAPI
136
+ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
140
+
141
+ cudnnStatus_t CUDNNWINAPI
142
+ cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
143
+ cudnnRNNAlgo_t algo,
144
+ cudnnRNNMode_t cellMode,
145
+ cudnnRNNBiasMode_t biasMode,
146
+ cudnnDirectionMode_t dirMode,
147
+ cudnnRNNInputMode_t inputMode,
148
+ cudnnDataType_t dataType,
149
+ cudnnDataType_t mathPrec,
150
+ cudnnMathType_t mathType,
151
+ int32_t inputSize,
152
+ int32_t hiddenSize,
153
+ int32_t projSize,
154
+ int32_t numLayers,
155
+ cudnnDropoutDescriptor_t dropoutDesc,
156
+ uint32_t auxFlags);
157
+
158
+ cudnnStatus_t CUDNNWINAPI
159
+ cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
160
+ cudnnRNNAlgo_t *algo,
161
+ cudnnRNNMode_t *cellMode,
162
+ cudnnRNNBiasMode_t *biasMode,
163
+ cudnnDirectionMode_t *dirMode,
164
+ cudnnRNNInputMode_t *inputMode,
165
+ cudnnDataType_t *dataType,
166
+ cudnnDataType_t *mathPrec,
167
+ cudnnMathType_t *mathType,
168
+ int32_t *inputSize,
169
+ int32_t *hiddenSize,
170
+ int32_t *projSize,
171
+ int32_t *numLayers,
172
+ cudnnDropoutDescriptor_t *dropoutDesc,
173
+ uint32_t *auxFlags);
174
+
175
+ /*
176
+ * mathPrec in cudnnSetRNNDescriptor_v6() specifies compute precision
177
+ * compute precision is further modified by cudnnSetRNNMatrixMathType()
178
+ * dataType in cudnnGetRNNParamsSize() and wDesc specify weight storage
179
+ * dropout is between RNN layers, not between recurrent steps
180
+ */
181
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
182
+ cudnnSetRNNDescriptor_v6(cudnnHandle_t handle,
183
+ cudnnRNNDescriptor_t rnnDesc,
184
+ const int hiddenSize,
185
+ const int numLayers,
186
+ cudnnDropoutDescriptor_t dropoutDesc,
187
+ cudnnRNNInputMode_t inputMode,
188
+ cudnnDirectionMode_t direction,
189
+ cudnnRNNMode_t cellMode,
190
+ cudnnRNNAlgo_t algo,
191
+ cudnnDataType_t mathPrec);
192
+
193
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
194
+ cudnnGetRNNDescriptor_v6(cudnnHandle_t handle,
195
+ cudnnRNNDescriptor_t rnnDesc,
196
+ int *hiddenSize,
197
+ int *numLayers,
198
+ cudnnDropoutDescriptor_t *dropoutDesc,
199
+ cudnnRNNInputMode_t *inputMode,
200
+ cudnnDirectionMode_t *direction,
201
+ cudnnRNNMode_t *cellMode,
202
+ cudnnRNNAlgo_t *algo,
203
+ cudnnDataType_t *mathPrec);
204
+
205
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
206
+ cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType);
207
+
208
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
209
+ cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType);
210
+
211
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
212
+ cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t biasMode);
213
+
214
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
215
+ cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t *biasMode);
216
+
217
+ cudnnStatus_t CUDNNWINAPI
218
+ cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
219
+ cudnnRNNClipMode_t clipMode,
220
+ cudnnNanPropagation_t clipNanOpt,
221
+ double lclip,
222
+ double rclip);
223
+
224
+ cudnnStatus_t CUDNNWINAPI
225
+ cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
226
+ cudnnRNNClipMode_t *clipMode,
227
+ cudnnNanPropagation_t *clipNanOpt,
228
+ double *lclip,
229
+ double *rclip);
230
+
231
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
232
+ cudnnRNNSetClip(cudnnHandle_t handle,
233
+ cudnnRNNDescriptor_t rnnDesc,
234
+ cudnnRNNClipMode_t clipMode,
235
+ cudnnNanPropagation_t clipNanOpt,
236
+ double lclip,
237
+ double rclip);
238
+
239
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
240
+ cudnnRNNGetClip(cudnnHandle_t handle,
241
+ cudnnRNNDescriptor_t rnnDesc,
242
+ cudnnRNNClipMode_t *clipMode,
243
+ cudnnNanPropagation_t *clipNanOpt,
244
+ double *lclip,
245
+ double *rclip);
246
+
247
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
248
+ cudnnSetRNNProjectionLayers(cudnnHandle_t handle,
249
+ cudnnRNNDescriptor_t rnnDesc,
250
+ const int recProjSize,
251
+ const int outProjSize);
252
+
253
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
254
+ cudnnGetRNNProjectionLayers(cudnnHandle_t handle,
255
+ const cudnnRNNDescriptor_t rnnDesc,
256
+ int *recProjSize,
257
+ int *outProjSize);
258
+
259
+ /* Expensive. Creates the plan for the specific settings. */
260
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
261
+ cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
262
+ const int minibatch,
263
+ const cudnnDataType_t dataType,
264
+ cudnnPersistentRNNPlan_t *plan);
265
+
266
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
267
+ cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan);
268
+
269
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
270
+ cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan);
271
+
272
+ cudnnStatus_t CUDNNWINAPI
273
+ cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
274
+
275
+ /* dataType in weight descriptors and input descriptors is used to describe storage */
276
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
277
+ cudnnGetRNNWorkspaceSize(cudnnHandle_t handle,
278
+ const cudnnRNNDescriptor_t rnnDesc,
279
+ const int seqLength,
280
+ const cudnnTensorDescriptor_t *xDesc,
281
+ size_t *sizeInBytes);
282
+
283
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
284
+ cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle,
285
+ const cudnnRNNDescriptor_t rnnDesc,
286
+ const int seqLength,
287
+ const cudnnTensorDescriptor_t *xDesc,
288
+ size_t *sizeInBytes);
289
+
290
+ cudnnStatus_t CUDNNWINAPI
291
+ cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
292
+ cudnnRNNDescriptor_t rnnDesc,
293
+ cudnnForwardMode_t fMode,
294
+ cudnnRNNDataDescriptor_t xDesc,
295
+ size_t *workSpaceSize,
296
+ size_t *reserveSpaceSize);
297
+
298
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
299
+ cudnnGetRNNParamsSize(cudnnHandle_t handle,
300
+ const cudnnRNNDescriptor_t rnnDesc,
301
+ const cudnnTensorDescriptor_t xDesc,
302
+ size_t *sizeInBytes,
303
+ cudnnDataType_t dataType);
304
+
305
+ cudnnStatus_t CUDNNWINAPI
306
+ cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
307
+
308
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
309
+ cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle,
310
+ const cudnnRNNDescriptor_t rnnDesc,
311
+ const int pseudoLayer,
312
+ const cudnnTensorDescriptor_t xDesc,
313
+ const cudnnFilterDescriptor_t wDesc,
314
+ const void *w,
315
+ const int linLayerID,
316
+ cudnnFilterDescriptor_t linLayerMatDesc,
317
+ void **linLayerMat);
318
+
319
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
320
+ cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle,
321
+ const cudnnRNNDescriptor_t rnnDesc,
322
+ const int pseudoLayer,
323
+ const cudnnTensorDescriptor_t xDesc,
324
+ const cudnnFilterDescriptor_t wDesc,
325
+ const void *w,
326
+ const int linLayerID,
327
+ cudnnFilterDescriptor_t linLayerBiasDesc,
328
+ void **linLayerBias);
329
+
330
+ cudnnStatus_t CUDNNWINAPI
331
+ cudnnGetRNNWeightParams(cudnnHandle_t handle,
332
+ cudnnRNNDescriptor_t rnnDesc,
333
+ int32_t pseudoLayer,
334
+ size_t weightSpaceSize,
335
+ const void *weightSpace,
336
+ int32_t linLayerID,
337
+ cudnnTensorDescriptor_t mDesc,
338
+ void **mAddr,
339
+ cudnnTensorDescriptor_t bDesc,
340
+ void **bAddr);
341
+
342
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
343
+ cudnnRNNForwardInference(cudnnHandle_t handle,
344
+ const cudnnRNNDescriptor_t rnnDesc,
345
+ const int seqLength,
346
+ const cudnnTensorDescriptor_t *xDesc,
347
+ const void *x,
348
+ const cudnnTensorDescriptor_t hxDesc,
349
+ const void *hx,
350
+ const cudnnTensorDescriptor_t cxDesc,
351
+ const void *cx,
352
+ const cudnnFilterDescriptor_t wDesc,
353
+ const void *w,
354
+ const cudnnTensorDescriptor_t *yDesc,
355
+ void *y,
356
+ const cudnnTensorDescriptor_t hyDesc,
357
+ void *hy,
358
+ const cudnnTensorDescriptor_t cyDesc,
359
+ void *cy,
360
+ void *workSpace,
361
+ size_t workSpaceSizeInBytes);
362
+
363
+ /* RNN EX API */
364
+
365
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
366
+ cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned paddingMode);
367
+
368
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
369
+ cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned *paddingMode);
370
+
371
+ cudnnStatus_t CUDNNWINAPI
372
+ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
373
+
374
+ cudnnStatus_t CUDNNWINAPI
375
+ cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
376
+
377
+ cudnnStatus_t CUDNNWINAPI
378
+ cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
379
+ cudnnDataType_t dataType,
380
+ cudnnRNNDataLayout_t layout,
381
+ int maxSeqLength,
382
+ int batchSize,
383
+ int vectorSize,
384
+ const int seqLengthArray[], /* length of each sequence in the batch */
385
+ void *paddingFill); /* symbol for filling padding position in output */
386
+
387
+ cudnnStatus_t CUDNNWINAPI
388
+ cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
389
+ cudnnDataType_t *dataType,
390
+ cudnnRNNDataLayout_t *layout,
391
+ int *maxSeqLength,
392
+ int *batchSize,
393
+ int *vectorSize,
394
+ int arrayLengthRequested,
395
+ int seqLengthArray[],
396
+ void *paddingFill);
397
+
398
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
399
+ cudnnRNNForwardInferenceEx(cudnnHandle_t handle,
400
+ const cudnnRNNDescriptor_t rnnDesc,
401
+ const cudnnRNNDataDescriptor_t xDesc,
402
+ const void *x,
403
+ const cudnnTensorDescriptor_t hxDesc,
404
+ const void *hx,
405
+ const cudnnTensorDescriptor_t cxDesc,
406
+ const void *cx,
407
+ const cudnnFilterDescriptor_t wDesc,
408
+ const void *w,
409
+ const cudnnRNNDataDescriptor_t yDesc,
410
+ void *y,
411
+ const cudnnTensorDescriptor_t hyDesc,
412
+ void *hy,
413
+ const cudnnTensorDescriptor_t cyDesc,
414
+ void *cy,
415
+ const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
416
+ const void *keys, /* reserved, should pass NULL */
417
+ const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
418
+ void *cAttn, /* reserved, should pass NULL */
419
+ const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
420
+ void *iAttn, /* reserved, should pass NULL */
421
+ const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
422
+ void *queries, /* reserved, should pass NULL */
423
+ void *workSpace,
424
+ size_t workSpaceSizeInBytes);
425
+
426
+ cudnnStatus_t CUDNNWINAPI
427
+ cudnnRNNForward(cudnnHandle_t handle,
428
+ cudnnRNNDescriptor_t rnnDesc,
429
+ cudnnForwardMode_t fwdMode,
430
+ const int32_t devSeqLengths[],
431
+ cudnnRNNDataDescriptor_t xDesc,
432
+ const void *x,
433
+ cudnnRNNDataDescriptor_t yDesc,
434
+ void *y,
435
+ cudnnTensorDescriptor_t hDesc,
436
+ const void *hx,
437
+ void *hy,
438
+ cudnnTensorDescriptor_t cDesc,
439
+ const void *cx,
440
+ void *cy,
441
+ size_t weightSpaceSize,
442
+ const void *weightSpace,
443
+ size_t workSpaceSize,
444
+ void *workSpace,
445
+ size_t reserveSpaceSize,
446
+ void *reserveSpace);
447
+
448
+ /* RNN FIND API */
449
+
450
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
451
+ cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc);
452
+
453
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
454
+ cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
455
+
456
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
457
+ cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle,
458
+ const cudnnRNNDescriptor_t rnnDesc,
459
+ const int seqLength,
460
+ const cudnnTensorDescriptor_t *xDesc,
461
+ const void *x,
462
+ const cudnnTensorDescriptor_t hxDesc,
463
+ const void *hx,
464
+ const cudnnTensorDescriptor_t cxDesc,
465
+ const void *cx,
466
+ const cudnnFilterDescriptor_t wDesc,
467
+ const void *w,
468
+ const cudnnTensorDescriptor_t *yDesc,
469
+ void *y,
470
+ const cudnnTensorDescriptor_t hyDesc,
471
+ void *hy,
472
+ const cudnnTensorDescriptor_t cyDesc,
473
+ void *cy,
474
+ const float findIntensity,
475
+ const int requestedAlgoCount,
476
+ int *returnedAlgoCount,
477
+ cudnnAlgorithmPerformance_t *perfResults,
478
+ void *workspace,
479
+ size_t workSpaceSizeInBytes);
480
+
481
+ /* Sequence data descriptor */
482
+
483
+ typedef enum {
484
+ CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */
485
+ CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
486
+ CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */
487
+ CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */
488
+ } cudnnSeqDataAxis_t;
489
+
490
+ struct cudnnSeqDataStruct;
491
+ typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t;
492
+
493
+ #define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
494
+
495
+ cudnnStatus_t CUDNNWINAPI
496
+ cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
497
+
498
+ cudnnStatus_t CUDNNWINAPI
499
+ cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
500
+
501
+ cudnnStatus_t CUDNNWINAPI
502
+ cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
503
+ cudnnDataType_t dataType,
504
+ int nbDims,
505
+ const int dimA[],
506
+ const cudnnSeqDataAxis_t axes[],
507
+ size_t seqLengthArraySize,
508
+ const int seqLengthArray[],
509
+ void *paddingFill);
510
+
511
+ cudnnStatus_t CUDNNWINAPI
512
+ cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
513
+ cudnnDataType_t *dataType,
514
+ int *nbDims,
515
+ int nbDimsRequested,
516
+ int dimA[],
517
+ cudnnSeqDataAxis_t axes[],
518
+ size_t *seqLengthArraySize,
519
+ size_t seqLengthSizeRequested,
520
+ int seqLengthArray[],
521
+ void *paddingFill);
522
+
523
+ /* Multihead Attention */
524
+
525
+ /* Legacy type for backward compatibility */
526
+ typedef unsigned cudnnAttnQueryMap_t;
527
+
528
+ /*
529
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
530
+ * Use the bitwise OR operator to combine several settings listed below. Additional
531
+ * minor options can be added here w/o changing or introducing new API functions.
532
+ */
533
+ #define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */
534
+ #define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
535
+ #define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */
536
+ #define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */
537
+
538
+ struct cudnnAttnStruct;
539
+ typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t;
540
+
541
+ cudnnStatus_t CUDNNWINAPI
542
+ cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
543
+
544
+ cudnnStatus_t CUDNNWINAPI
545
+ cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
546
+
547
+ cudnnStatus_t CUDNNWINAPI
548
+ cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
549
+ unsigned attnMode,
550
+ int nHeads,
551
+ double smScaler,
552
+ cudnnDataType_t dataType,
553
+ cudnnDataType_t computePrec,
554
+ cudnnMathType_t mathType,
555
+ cudnnDropoutDescriptor_t attnDropoutDesc,
556
+ cudnnDropoutDescriptor_t postDropoutDesc,
557
+ int qSize,
558
+ int kSize,
559
+ int vSize,
560
+ int qProjSize,
561
+ int kProjSize,
562
+ int vProjSize,
563
+ int oProjSize,
564
+ int qoMaxSeqLength,
565
+ int kvMaxSeqLength,
566
+ int maxBatchSize,
567
+ int maxBeamSize);
568
+
569
+ cudnnStatus_t CUDNNWINAPI
570
+ cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
571
+ unsigned *attnMode,
572
+ int *nHeads,
573
+ double *smScaler,
574
+ cudnnDataType_t *dataType,
575
+ cudnnDataType_t *computePrec,
576
+ cudnnMathType_t *mathType,
577
+ cudnnDropoutDescriptor_t *attnDropoutDesc,
578
+ cudnnDropoutDescriptor_t *postDropoutDesc,
579
+ int *qSize,
580
+ int *kSize,
581
+ int *vSize,
582
+ int *qProjSize,
583
+ int *kProjSize,
584
+ int *vProjSize,
585
+ int *oProjSize,
586
+ int *qoMaxSeqLength,
587
+ int *kvMaxSeqLength,
588
+ int *maxBatchSize,
589
+ int *maxBeamSize);
590
+
591
+ cudnnStatus_t CUDNNWINAPI
592
+ cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
593
+ const cudnnAttnDescriptor_t attnDesc,
594
+ size_t *weightSizeInBytes,
595
+ size_t *workSpaceSizeInBytes,
596
+ size_t *reserveSpaceSizeInBytes);
597
+
598
+ typedef enum {
599
+ CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
600
+ CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
601
+ CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
602
+ CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
603
+ CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */
604
+ CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */
605
+ CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */
606
+ CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */
607
+ } cudnnMultiHeadAttnWeightKind_t;
608
+
609
+ #define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
610
+
611
+ cudnnStatus_t CUDNNWINAPI
612
+ cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
613
+ const cudnnAttnDescriptor_t attnDesc,
614
+ cudnnMultiHeadAttnWeightKind_t wKind,
615
+ size_t weightSizeInBytes,
616
+ const void *weights,
617
+ cudnnTensorDescriptor_t wDesc,
618
+ void **wAddr);
619
+
620
+ cudnnStatus_t CUDNNWINAPI
621
+ cudnnMultiHeadAttnForward(cudnnHandle_t handle,
622
+ const cudnnAttnDescriptor_t attnDesc,
623
+ int currIdx,
624
+ const int loWinIdx[],
625
+ const int hiWinIdx[],
626
+ const int devSeqLengthsQO[],
627
+ const int devSeqLengthsKV[],
628
+ const cudnnSeqDataDescriptor_t qDesc,
629
+ const void *queries,
630
+ const void *residuals,
631
+ const cudnnSeqDataDescriptor_t kDesc,
632
+ const void *keys,
633
+ const cudnnSeqDataDescriptor_t vDesc,
634
+ const void *values,
635
+ const cudnnSeqDataDescriptor_t oDesc,
636
+ void *out,
637
+ size_t weightSizeInBytes,
638
+ const void *weights,
639
+ size_t workSpaceSizeInBytes,
640
+ void *workSpace,
641
+ size_t reserveSpaceSizeInBytes,
642
+ void *reserveSpace);
643
+
644
+ /*
645
+ * \brief Cross-library version checker.
646
+ * This function is implemented differently in each sub-library. Each sublib
647
+ * checks whether its own version matches that of its dependencies.
648
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
649
+ * CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
650
+ */
651
+ cudnnStatus_t CUDNNWINAPI
652
+ cudnnAdvInferVersionCheck(void);
653
+
654
+ #if defined(__cplusplus)
655
+ }
656
+ #endif
657
+
658
+ #endif /* CUDNN_ADV_INFER_H_ */
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h ADDED
The diff for this file is too large to render. See raw diff
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (220 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (220 Bytes). View file
 
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h ADDED
@@ -0,0 +1,448 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*************************************************************************
2
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
3
+ *
4
+ * See LICENSE.txt for license information
5
+ ************************************************************************/
6
+
7
+ #ifndef NCCL_H_
8
+ #define NCCL_H_
9
+
10
+ #include <cuda_runtime.h>
11
+ #include <cuda_fp16.h>
12
+ #if CUDART_VERSION >= 11000
13
+ #include <cuda_bf16.h>
14
+ #endif
15
+
16
+ #define NCCL_MAJOR 2
17
+ #define NCCL_MINOR 20
18
+ #define NCCL_PATCH 5
19
+ #define NCCL_SUFFIX ""
20
+
21
+ #define NCCL_VERSION_CODE 22005
22
+ #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
23
+
24
+ #ifdef __cplusplus
25
+ extern "C" {
26
+ #endif
27
+
28
+ #include <limits.h>
29
+ /* Opaque handle to communicator */
30
+ typedef struct ncclComm* ncclComm_t;
31
+ #define NCCL_COMM_NULL NULL
32
+
33
+ #define NCCL_UNIQUE_ID_BYTES 128
34
+ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
35
+
36
+ /* Error type */
37
+ typedef enum { ncclSuccess = 0,
38
+ ncclUnhandledCudaError = 1,
39
+ ncclSystemError = 2,
40
+ ncclInternalError = 3,
41
+ ncclInvalidArgument = 4,
42
+ ncclInvalidUsage = 5,
43
+ ncclRemoteError = 6,
44
+ ncclInProgress = 7,
45
+ ncclNumResults = 8 } ncclResult_t;
46
+
47
+ #define NCCL_CONFIG_UNDEF_INT INT_MIN
48
+ #define NCCL_CONFIG_UNDEF_PTR NULL
49
+ #define NCCL_SPLIT_NOCOLOR -1
50
+
51
+ /* Communicator configuration. Users can assign value to attributes to specify the
52
+ * behavior of a communicator. */
53
+ typedef struct ncclConfig_v21700 {
54
+ /* attributes that users should never touch. */
55
+ size_t size;
56
+ unsigned int magic;
57
+ unsigned int version;
58
+ /* attributes that users are able to customize. */
59
+ int blocking;
60
+ int cgaClusterSize;
61
+ int minCTAs;
62
+ int maxCTAs;
63
+ const char *netName;
64
+ int splitShare;
65
+ } ncclConfig_t;
66
+
67
+ /* Config initializer must be assigned to initialize config structure when it is created.
68
+ * Not initialized config will result in NCCL error. */
69
+ #define NCCL_CONFIG_INITIALIZER { \
70
+ sizeof(ncclConfig_t), /* size */ \
71
+ 0xcafebeef, /* magic */ \
72
+ NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \
73
+ NCCL_CONFIG_UNDEF_INT, /* blocking */ \
74
+ NCCL_CONFIG_UNDEF_INT, /* cgaClusterSize */ \
75
+ NCCL_CONFIG_UNDEF_INT, /* minCTAs */ \
76
+ NCCL_CONFIG_UNDEF_INT, /* maxCTAs */ \
77
+ NCCL_CONFIG_UNDEF_PTR, /* netName */ \
78
+ NCCL_CONFIG_UNDEF_INT /* splitShare */ \
79
+ }
80
+
81
+ /* NCCL malloc and free function for all types of NCCL optimizations
82
+ * (e.g. user buffer registration). The actual allocated size might
83
+ * be larger than requested due to granularity requirement. */
84
+ ncclResult_t ncclMemAlloc(void** ptr, size_t size);
85
+ ncclResult_t pncclMemAlloc(void** ptr, size_t size);
86
+
87
+ ncclResult_t ncclMemFree(void *ptr);
88
+ ncclResult_t pncclMemFree(void *ptr);
89
+
90
+ /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
91
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
92
+ * NCCL library
93
+ */
94
+ ncclResult_t ncclGetVersion(int *version);
95
+ ncclResult_t pncclGetVersion(int *version);
96
+
97
+ /* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
98
+ * called once and the Id should be distributed to all ranks in the
99
+ * communicator before calling ncclCommInitRank. */
100
+ ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId);
101
+ ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
102
+
103
+ /* Create a new communicator (multi thread/process version) with a configuration
104
+ * set by users. */
105
+ ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
106
+ ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
107
+
108
+ /* Creates a new communicator (multi thread/process version).
109
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
110
+ * Each rank is associated to a CUDA device, which has to be set before calling
111
+ * ncclCommInitRank.
112
+ * ncclCommInitRank implicitly syncronizes with other ranks, so it must be
113
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
114
+ ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
115
+ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
116
+
117
+ /* Creates a clique of communicators (single process version).
118
+ * This is a convenience function to create a single-process communicator clique.
119
+ * Returns an array of ndev newly initialized communicators in comm.
120
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
121
+ * If devlist is NULL, the first ndev CUDA devices are used.
122
+ * Order of devlist defines user-order of processors within the communicator. */
123
+ ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
124
+ ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
125
+
126
+ /* Finalize a communicator. ncclCommFinalize flushes all issued communications,
127
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
128
+ * when the communicator is globally quiescent and related resources are freed; then,
129
+ * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
130
+ * itself) without blocking. */
131
+ ncclResult_t ncclCommFinalize(ncclComm_t comm);
132
+ ncclResult_t pncclCommFinalize(ncclComm_t comm);
133
+
134
+ /* Frees local resources associated with communicator object. */
135
+ ncclResult_t ncclCommDestroy(ncclComm_t comm);
136
+ ncclResult_t pncclCommDestroy(ncclComm_t comm);
137
+
138
+ /* Frees resources associated with communicator object and aborts any operations
139
+ * that might still be running on the device. */
140
+ ncclResult_t ncclCommAbort(ncclComm_t comm);
141
+ ncclResult_t pncclCommAbort(ncclComm_t comm);
142
+
143
+ /* Creates one or more communicators from an existing one.
144
+ * Ranks with the same color will end up in the same communicator.
145
+ * Within the new communicator, key will be used to order ranks.
146
+ * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
147
+ * and will therefore return a NULL communicator.
148
+ * If config is NULL, the new communicator will inherit the original communicator's
149
+ * configuration*/
150
+ ncclResult_t ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
151
+ ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
152
+
153
+ /* Returns a string for each error code. */
154
+ const char* ncclGetErrorString(ncclResult_t result);
155
+ const char* pncclGetErrorString(ncclResult_t result);
156
+
157
+ /* Returns a human-readable message of the last error that occurred. */
158
+ const char* ncclGetLastError(ncclComm_t comm);
159
+ const char* pncclGetLastError(ncclComm_t comm);
160
+
161
+ /* Checks whether the comm has encountered any asynchronous errors */
162
+ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
163
+ ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
164
+
165
+ /* Gets the number of ranks in the communicator clique. */
166
+ ncclResult_t ncclCommCount(const ncclComm_t comm, int* count);
167
+ ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
168
+
169
+ /* Returns the cuda device number associated with the communicator. */
170
+ ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device);
171
+ ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
172
+
173
+ /* Returns the user-ordered "rank" associated with the communicator. */
174
+ ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank);
175
+ ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
176
+
177
+
178
+ /* Register CUDA buffer for zero-copy operation */
179
+ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
180
+ ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
181
+
182
+ /* Deregister CUDA buffer */
183
+ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
184
+ ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
185
+
186
+ /* Reduction operation selector */
187
+ typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
188
+ typedef enum { ncclSum = 0,
189
+ ncclProd = 1,
190
+ ncclMax = 2,
191
+ ncclMin = 3,
192
+ ncclAvg = 4,
193
+ /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
194
+ * serves as the least possible value for dynamic ncclRedOp_t's
195
+ * as constructed by ncclRedOpCreate*** functions. */
196
+ ncclNumOps = 5,
197
+ /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
198
+ * It is defined to be the largest signed value (since compilers
199
+ * are permitted to use signed enums) that won't grow
200
+ * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
201
+ * maintain ABI compatibility. */
202
+ ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
203
+ } ncclRedOp_t;
204
+
205
+ /* Data types */
206
+ typedef enum { ncclInt8 = 0, ncclChar = 0,
207
+ ncclUint8 = 1,
208
+ ncclInt32 = 2, ncclInt = 2,
209
+ ncclUint32 = 3,
210
+ ncclInt64 = 4,
211
+ ncclUint64 = 5,
212
+ ncclFloat16 = 6, ncclHalf = 6,
213
+ ncclFloat32 = 7, ncclFloat = 7,
214
+ ncclFloat64 = 8, ncclDouble = 8,
215
+ #if defined(__CUDA_BF16_TYPES_EXIST__)
216
+ ncclBfloat16 = 9,
217
+ ncclNumTypes = 10
218
+ #else
219
+ ncclNumTypes = 9
220
+ #endif
221
+ } ncclDataType_t;
222
+
223
+ /* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
224
+ typedef enum {
225
+ /* ncclScalarDevice: The scalar is in device-visible memory and will be
226
+ * dereferenced while the collective is running. */
227
+ ncclScalarDevice = 0,
228
+
229
+ /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
230
+ * dereferenced before the ncclRedOpCreate***() function returns. */
231
+ ncclScalarHostImmediate = 1
232
+ } ncclScalarResidence_t;
233
+
234
+ /*
235
+ * ncclRedOpCreatePreMulSum
236
+ *
237
+ * Creates a new reduction operator which pre-multiplies input values by a given
238
+ * scalar locally before reducing them with peer values via summation. For use
239
+ * only with collectives launched against *comm* and *datatype*. The
240
+ * *residence* argument indicates how/when the memory pointed to by *scalar*
241
+ * will be dereferenced. Upon return, the newly created operator's handle
242
+ * is stored in *op*.
243
+ */
244
+ ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
245
+ ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
246
+
247
+ /*
248
+ * ncclRedOpDestroy
249
+ *
250
+ * Destroys the reduction operator *op*. The operator must have been created by
251
+ * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
252
+ * destroyed as soon as the last NCCL function which is given that operator returns.
253
+ */
254
+ ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
255
+ ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
256
+
257
+ /*
258
+ * Collective communication operations
259
+ *
260
+ * Collective communication operations must be called separately for each
261
+ * communicator in a communicator clique.
262
+ *
263
+ * They return when operations have been enqueued on the CUDA stream.
264
+ *
265
+ * Since they may perform inter-CPU synchronization, each call has to be done
266
+ * from a different thread or process, or need to use Group Semantics (see
267
+ * below).
268
+ */
269
+
270
+ /*
271
+ * Reduce
272
+ *
273
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
274
+ * operation.
275
+ * recvbuff may be NULL on all calls except for root device.
276
+ * root is the rank (not the CUDA device) where data will reside after the
277
+ * operation is complete.
278
+ *
279
+ * In-place operation will happen if sendbuff == recvbuff.
280
+ */
281
+ ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
282
+ ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
283
+ ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
284
+ ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
285
+
286
+ /*
287
+ * (deprecated) Broadcast (in-place)
288
+ *
289
+ * Copies count values from root to all other devices.
290
+ * root is the rank (not the CUDA device) where data resides before the
291
+ * operation is started.
292
+ *
293
+ * This operation is implicitely in place.
294
+ */
295
+ ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
296
+ ncclComm_t comm, cudaStream_t stream);
297
+ ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
298
+ ncclComm_t comm, cudaStream_t stream);
299
+
300
+ /*
301
+ * Broadcast
302
+ *
303
+ * Copies count values from root to all other devices.
304
+ * root is the rank (not the CUDA device) where data resides before the
305
+ * operation is started.
306
+ *
307
+ * In-place operation will happen if sendbuff == recvbuff.
308
+ */
309
+ ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
310
+ ncclComm_t comm, cudaStream_t stream);
311
+ ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
312
+ ncclComm_t comm, cudaStream_t stream);
313
+
314
+ /*
315
+ * All-Reduce
316
+ *
317
+ * Reduces data arrays of length count in sendbuff using op operation, and
318
+ * leaves identical copies of result on each recvbuff.
319
+ *
320
+ * In-place operation will happen if sendbuff == recvbuff.
321
+ */
322
+ ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
323
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
324
+ ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
325
+ ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
326
+
327
+ /*
328
+ * Reduce-Scatter
329
+ *
330
+ * Reduces data in sendbuff using op operation and leaves reduced result
331
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
332
+ * block of the result.
333
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
334
+ * should have a size of at least nranks*recvcount elements.
335
+ *
336
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
337
+ */
338
+ ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff,
339
+ size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
340
+ cudaStream_t stream);
341
+ ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
342
+ size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
343
+ cudaStream_t stream);
344
+
345
+ /*
346
+ * All-Gather
347
+ *
348
+ * Each device gathers sendcount values from other GPUs into recvbuff,
349
+ * receiving data from rank i at offset i*sendcount.
350
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
351
+ * should have a size of at least nranks*sendcount elements.
352
+ *
353
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
354
+ */
355
+ ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
356
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
357
+ ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
358
+ ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
359
+
360
+ /*
361
+ * Send
362
+ *
363
+ * Send data from sendbuff to rank peer.
364
+ *
365
+ * Rank peer needs to call ncclRecv with the same datatype and the same count from this
366
+ * rank.
367
+ *
368
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
369
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
370
+ * ncclGroupEnd section.
371
+ */
372
+ ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
373
+ ncclComm_t comm, cudaStream_t stream);
374
+ ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
375
+ ncclComm_t comm, cudaStream_t stream);
376
+
377
+ /*
378
+ * Receive
379
+ *
380
+ * Receive data from rank peer into recvbuff.
381
+ *
382
+ * Rank peer needs to call ncclSend with the same datatype and the same count to this
383
+ * rank.
384
+ *
385
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
386
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
387
+ * ncclGroupEnd section.
388
+ */
389
+ ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
390
+ ncclComm_t comm, cudaStream_t stream);
391
+ ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
392
+ ncclComm_t comm, cudaStream_t stream);
393
+
394
+ /*
395
+ * Group semantics
396
+ *
397
+ * When managing multiple GPUs from a single thread, and since NCCL collective
398
+ * calls may perform inter-CPU synchronization, we need to "group" calls for
399
+ * different ranks/devices into a single call.
400
+ *
401
+ * Grouping NCCL calls as being part of the same collective operation is done
402
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
403
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
404
+ * to be complete. Note that for collective communication, ncclGroupEnd only
405
+ * guarantees that the operations are enqueued on the streams, not that
406
+ * the operation is effectively done.
407
+ *
408
+ * Both collective communication and ncclCommInitRank can be used in conjunction
409
+ * of ncclGroupStart/ncclGroupEnd, but not together.
410
+ *
411
+ * Group semantics also allow to fuse multiple operations on the same device
412
+ * to improve performance (for aggregated collective calls), or to permit
413
+ * concurrent progress of multiple send/receive operations.
414
+ */
415
+
416
+ /*
417
+ * Group Start
418
+ *
419
+ * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
420
+ * a single NCCL operation. Nothing will be started on the CUDA stream until
421
+ * ncclGroupEnd.
422
+ */
423
+ ncclResult_t ncclGroupStart();
424
+ ncclResult_t pncclGroupStart();
425
+
426
+ /*
427
+ * Group End
428
+ *
429
+ * End a group call. Start a fused NCCL operation consisting of all calls since
430
+ * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
431
+ * need to be called after ncclGroupEnd.
432
+ */
433
+ ncclResult_t ncclGroupEnd();
434
+ ncclResult_t pncclGroupEnd();
435
+
436
+ /* Register CUDA buffer for zero-copy operation */
437
+ ncclResult_t ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
438
+ ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
439
+
440
+ /* Deregister CUDA buffer */
441
+ ncclResult_t ncclCommDeregister(const ncclComm_t comm, void* handle);
442
+ ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
443
+
444
+ #ifdef __cplusplus
445
+ } // end extern "C"
446
+ #endif
447
+
448
+ #endif // end include guard
tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py ADDED
File without changes
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2015 Eric Larson
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ from __future__ import annotations
5
+
6
+ import logging
7
+ from argparse import ArgumentParser
8
+ from typing import TYPE_CHECKING
9
+
10
+ from pip._vendor import requests
11
+
12
+ from pip._vendor.cachecontrol.adapter import CacheControlAdapter
13
+ from pip._vendor.cachecontrol.cache import DictCache
14
+ from pip._vendor.cachecontrol.controller import logger
15
+
16
+ if TYPE_CHECKING:
17
+ from argparse import Namespace
18
+
19
+ from pip._vendor.cachecontrol.controller import CacheController
20
+
21
+
22
def setup_logging() -> None:
    """Turn on DEBUG output for the cachecontrol logger, writing to stderr."""
    stream_handler = logging.StreamHandler()
    logger.addHandler(stream_handler)
    logger.setLevel(logging.DEBUG)
26
+
27
+
28
def get_session() -> requests.Session:
    """Return a requests Session whose HTTP(S) traffic goes through a caching adapter."""
    caching_adapter = CacheControlAdapter(
        DictCache(), cache_etags=True, serializer=None, heuristic=None
    )
    session = requests.Session()
    for prefix in ("http://", "https://"):
        session.mount(prefix, caching_adapter)

    # Expose the controller so the caller can drive the cache directly.
    session.cache_controller = caching_adapter.controller  # type: ignore[attr-defined]
    return session
38
+
39
+
40
def get_args() -> Namespace:
    """Parse the command line: one positional ``url`` argument."""
    argument_parser = ArgumentParser()
    argument_parser.add_argument("url", help="The URL to try and cache")
    return argument_parser.parse_args()
44
+
45
+
46
def main() -> None:
    """Fetch a URL, store the response in the cache, then verify it round-trips."""
    args = get_args()
    sess = get_session()

    # Hit the network first so the fetch itself stays quiet in the logs.
    resp = sess.get(args.url)

    # Show cachecontrol's internal decisions from here on.
    setup_logging()

    # Manually store the response we just received.
    controller: CacheController = sess.cache_controller  # type: ignore[attr-defined]
    controller.cache_response(resp.request, resp.raw)

    # Round-trip check: can the entry be served back from the cache?
    print("Cached!" if controller.cached_request(resp.request) else "Not cached :(")
67
+
68
+
69
+ if __name__ == "__main__":
70
+ main()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2015 Eric Larson
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ from __future__ import annotations
5
+
6
+ import functools
7
+ import types
8
+ import zlib
9
+ from typing import TYPE_CHECKING, Any, Collection, Mapping
10
+
11
+ from pip._vendor.requests.adapters import HTTPAdapter
12
+
13
+ from pip._vendor.cachecontrol.cache import DictCache
14
+ from pip._vendor.cachecontrol.controller import PERMANENT_REDIRECT_STATUSES, CacheController
15
+ from pip._vendor.cachecontrol.filewrapper import CallbackFileWrapper
16
+
17
+ if TYPE_CHECKING:
18
+ from pip._vendor.requests import PreparedRequest, Response
19
+ from pip._vendor.urllib3 import HTTPResponse
20
+
21
+ from pip._vendor.cachecontrol.cache import BaseCache
22
+ from pip._vendor.cachecontrol.heuristics import BaseHeuristic
23
+ from pip._vendor.cachecontrol.serialize import Serializer
24
+
25
+
26
class CacheControlAdapter(HTTPAdapter):
    """A requests transport adapter that layers HTTP caching onto ``send``.

    ``send`` consults the cache before going to the network;
    ``build_response`` stores/refreshes entries and invalidates them after
    successful mutating requests.
    """

    # A successful request with one of these methods deletes the cached
    # entry for its URL (see the end of build_response).
    invalidating_methods = {"PUT", "PATCH", "DELETE"}

    def __init__(
        self,
        cache: BaseCache | None = None,
        cache_etags: bool = True,
        controller_class: type[CacheController] | None = None,
        serializer: Serializer | None = None,
        heuristic: BaseHeuristic | None = None,
        cacheable_methods: Collection[str] | None = None,
        *args: Any,
        **kw: Any,
    ) -> None:
        # Remaining positional/keyword arguments go straight to HTTPAdapter.
        super().__init__(*args, **kw)
        self.cache = DictCache() if cache is None else cache
        self.heuristic = heuristic
        # Only these HTTP methods are ever served from / stored into the cache.
        self.cacheable_methods = cacheable_methods or ("GET",)

        controller_factory = controller_class or CacheController
        self.controller = controller_factory(
            self.cache, cache_etags=cache_etags, serializer=serializer
        )

    def send(
        self,
        request: PreparedRequest,
        stream: bool = False,
        timeout: None | float | tuple[float, float] | tuple[float, None] = None,
        verify: bool | str = True,
        cert: (None | bytes | str | tuple[bytes | str, bytes | str]) = None,
        proxies: Mapping[str, str] | None = None,
        cacheable_methods: Collection[str] | None = None,
    ) -> Response:
        """
        Send a request. Use the request information to see if it
        exists in the cache and cache the response if we need to and can.
        """
        cacheable = cacheable_methods or self.cacheable_methods
        if request.method in cacheable:
            try:
                cached_response = self.controller.cached_request(request)
            except zlib.error:
                # A corrupt (undecompressable) cache entry is treated as a miss.
                cached_response = None
            if cached_response:
                return self.build_response(request, cached_response, from_cache=True)

            # check for etags and add headers if appropriate
            request.headers.update(self.controller.conditional_headers(request))

        resp = super().send(request, stream, timeout, verify, cert, proxies)

        return resp

    def build_response(
        self,
        request: PreparedRequest,
        response: HTTPResponse,
        from_cache: bool = False,
        cacheable_methods: Collection[str] | None = None,
    ) -> Response:
        """
        Build a response by making a request or using the cache.

        This will end up calling send and returning a potentially
        cached response
        """
        cacheable = cacheable_methods or self.cacheable_methods
        if not from_cache and request.method in cacheable:
            # Check for any heuristics that might update headers
            # before trying to cache.
            if self.heuristic:
                response = self.heuristic.apply(response)

            # apply any expiration heuristics
            if response.status == 304:
                # We must have sent an ETag request. This could mean
                # that we've been expired already or that we simply
                # have an etag. In either case, we want to try and
                # update the cache if that is the case.
                cached_response = self.controller.update_cached_response(
                    request, response
                )

                if cached_response is not response:
                    from_cache = True

                # We are done with the server response, read a
                # possible response body (compliant servers will
                # not return one, but we cannot be 100% sure) and
                # release the connection back to the pool.
                response.read(decode_content=False)
                response.release_conn()

                response = cached_response

            # We always cache the 301 responses
            elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
                self.controller.cache_response(request, response)
            else:
                # Wrap the response file with a wrapper that will cache the
                # response when the stream has been consumed.
                response._fp = CallbackFileWrapper(  # type: ignore[assignment]
                    response._fp,  # type: ignore[arg-type]
                    functools.partial(
                        self.controller.cache_response, request, response
                    ),
                )
                if response.chunked:
                    super_update_chunk_length = response._update_chunk_length

                    def _update_chunk_length(self: HTTPResponse) -> None:
                        # Chunked bodies have no EOF read to trigger the
                        # wrapper; fire the cache callback when the
                        # terminating zero-length chunk is seen.
                        super_update_chunk_length()
                        if self.chunk_left == 0:
                            self._fp._close()  # type: ignore[union-attr]

                    response._update_chunk_length = types.MethodType(  # type: ignore[method-assign]
                        _update_chunk_length, response
                    )

        resp: Response = super().build_response(request, response)  # type: ignore[no-untyped-call]

        # See if we should invalidate the cache.
        if request.method in self.invalidating_methods and resp.ok:
            assert request.url is not None
            cache_url = self.controller.cache_url(request.url)
            self.cache.delete(cache_url)

        # Give the request a from_cache attr to let people use it
        resp.from_cache = from_cache  # type: ignore[attr-defined]

        return resp

    def close(self) -> None:
        # Close the cache backend before the underlying pool manager.
        self.cache.close()
        super().close()  # type: ignore[no-untyped-call]
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2015 Eric Larson
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ The cache object API for implementing caches. The default is a thread
7
+ safe in-memory dictionary.
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from threading import Lock
12
+ from typing import IO, TYPE_CHECKING, MutableMapping
13
+
14
+ if TYPE_CHECKING:
15
+ from datetime import datetime
16
+
17
+
18
+ class BaseCache:
19
+ def get(self, key: str) -> bytes | None:
20
+ raise NotImplementedError()
21
+
22
+ def set(
23
+ self, key: str, value: bytes, expires: int | datetime | None = None
24
+ ) -> None:
25
+ raise NotImplementedError()
26
+
27
+ def delete(self, key: str) -> None:
28
+ raise NotImplementedError()
29
+
30
+ def close(self) -> None:
31
+ pass
32
+
33
+
34
class DictCache(BaseCache):
    """In-memory cache backed by a plain dict, with a lock guarding writes."""

    def __init__(self, init_dict: MutableMapping[str, bytes] | None = None) -> None:
        self.lock = Lock()
        self.data = init_dict if init_dict else {}

    def get(self, key: str) -> bytes | None:
        # Lookups are done without taking the lock.
        return self.data.get(key, None)

    def set(
        self, key: str, value: bytes, expires: int | datetime | None = None
    ) -> None:
        # ``expires`` is accepted for interface compatibility but ignored
        # by this in-memory backend.
        with self.lock:
            self.data[key] = value

    def delete(self, key: str) -> None:
        with self.lock:
            self.data.pop(key, None)
52
+
53
+
54
class SeparateBodyBaseCache(BaseCache):
    """
    In this variant, the body is not stored mixed in with the metadata, but is
    passed in (as a bytes-like object) in a separate call to ``set_body()``.

    That is, the expected interaction pattern is::

        cache.set(key, serialized_metadata)
        cache.set_body(key, body)

    Similarly, the body should be loaded separately via ``get_body()``.
    """

    def set_body(self, key: str, body: bytes) -> None:
        """Store the raw response *body* under *key*."""
        raise NotImplementedError()

    def get_body(self, key: str) -> IO[bytes] | None:
        """
        Return the body as file-like object.
        """
        raise NotImplementedError()
tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SPDX-FileCopyrightText: 2015 Eric Larson
2
+ #
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ The httplib2 algorithms ported for use with requests.
7
+ """
8
+ from __future__ import annotations
9
+
10
+ import calendar
11
+ import logging
12
+ import re
13
+ import time
14
+ from email.utils import parsedate_tz
15
+ from typing import TYPE_CHECKING, Collection, Mapping
16
+
17
+ from pip._vendor.requests.structures import CaseInsensitiveDict
18
+
19
+ from pip._vendor.cachecontrol.cache import DictCache, SeparateBodyBaseCache
20
+ from pip._vendor.cachecontrol.serialize import Serializer
21
+
22
+ if TYPE_CHECKING:
23
+ from typing import Literal
24
+
25
+ from pip._vendor.requests import PreparedRequest
26
+ from pip._vendor.urllib3 import HTTPResponse
27
+
28
+ from pip._vendor.cachecontrol.cache import BaseCache
29
+
30
logger = logging.getLogger(__name__)

# URI splitter regex from Appendix B of RFC 3986; consumed by parse_uri below.
URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")

# Permanent redirects: cached regardless of freshness headers
# (see CacheController.cached_request / cache_response).
PERMANENT_REDIRECT_STATUSES = (301, 308)
35
+
36
+
37
def parse_uri(uri: str) -> tuple[str, str, str, str, str]:
    """Split *uri* with the RFC 3986 Appendix B regex.

    Returns ``(scheme, authority, path, query, fragment)``.
    """
    parts = URI.match(uri)
    assert parts is not None
    # Odd-numbered groups capture the delimiters; the even ones hold the values.
    scheme = parts.group(2)
    authority = parts.group(4)
    path = parts.group(5)
    query = parts.group(7)
    fragment = parts.group(9)
    return (scheme, authority, path, query, fragment)
46
+
47
+
48
class CacheController:
    """Decides whether requests can be served from, and responses stored
    into, an HTTP cache (httplib2's algorithms ported for requests)."""

    def __init__(
        self,
        cache: BaseCache | None = None,
        cache_etags: bool = True,
        serializer: Serializer | None = None,
        status_codes: Collection[int] | None = None,
    ):
        # cache: backend storage; defaults to an in-memory DictCache.
        # cache_etags: when True, responses carrying an ETag are cached.
        # status_codes: status codes eligible for caching.
        self.cache = DictCache() if cache is None else cache
        self.cache_etags = cache_etags
        self.serializer = serializer or Serializer()
        self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308)

    @classmethod
    def _urlnorm(cls, uri: str) -> str:
        """Normalize the URL to create a safe key for the cache"""
        (scheme, authority, path, query, fragment) = parse_uri(uri)
        if not scheme or not authority:
            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)

        scheme = scheme.lower()
        authority = authority.lower()

        if not path:
            path = "/"

        # Could do syntax based normalization of the URI before
        # computing the digest. See Section 6.2.2 of Std 66.
        request_uri = query and "?".join([path, query]) or path
        defrag_uri = scheme + "://" + authority + request_uri

        return defrag_uri

    @classmethod
    def cache_url(cls, uri: str) -> str:
        """Return the normalized cache key for *uri* (fragment dropped)."""
        return cls._urlnorm(uri)

    def parse_cache_control(self, headers: Mapping[str, str]) -> dict[str, int | None]:
        """Parse a Cache-Control header into a ``{directive: int | None}`` dict.

        Unknown directives are ignored; directives whose required integer
        value is missing or invalid are logged and skipped.
        """
        known_directives = {
            # https://tools.ietf.org/html/rfc7234#section-5.2
            "max-age": (int, True),
            "max-stale": (int, False),
            "min-fresh": (int, True),
            "no-cache": (None, False),
            "no-store": (None, False),
            "no-transform": (None, False),
            "only-if-cached": (None, False),
            "must-revalidate": (None, False),
            "public": (None, False),
            "private": (None, False),
            "proxy-revalidate": (None, False),
            "s-maxage": (int, True),
        }

        cc_headers = headers.get("cache-control", headers.get("Cache-Control", ""))

        retval: dict[str, int | None] = {}

        for cc_directive in cc_headers.split(","):
            if not cc_directive.strip():
                continue

            parts = cc_directive.split("=", 1)
            directive = parts[0].strip()

            try:
                typ, required = known_directives[directive]
            except KeyError:
                logger.debug("Ignoring unknown cache-control directive: %s", directive)
                continue

            if not typ or not required:
                retval[directive] = None
            if typ:
                try:
                    retval[directive] = typ(parts[1].strip())
                except IndexError:
                    if required:
                        logger.debug(
                            "Missing value for cache-control " "directive: %s",
                            directive,
                        )
                except ValueError:
                    logger.debug(
                        "Invalid value for cache-control directive " "%s, must be %s",
                        directive,
                        typ.__name__,
                    )

        return retval

    def _load_from_cache(self, request: PreparedRequest) -> HTTPResponse | None:
        """
        Load a cached response, or return None if it's not available.
        """
        # We do not support caching of partial content: so if the request contains a
        # Range header then we don't want to load anything from the cache.
        if "Range" in request.headers:
            return None

        # NOTE(review): the raw request.url is used as the key here, while
        # cached_request/cache_response use the normalized cache_url() —
        # presumably the two coincide for prepared requests; verify.
        cache_url = request.url
        assert cache_url is not None
        cache_data = self.cache.get(cache_url)
        if cache_data is None:
            logger.debug("No cache entry available")
            return None

        if isinstance(self.cache, SeparateBodyBaseCache):
            body_file = self.cache.get_body(cache_url)
        else:
            body_file = None

        result = self.serializer.loads(request, cache_data, body_file)
        if result is None:
            logger.warning("Cache entry deserialization failed, entry ignored")
        return result

    def cached_request(self, request: PreparedRequest) -> HTTPResponse | Literal[False]:
        """
        Return a cached response if it exists in the cache, otherwise
        return False.
        """
        assert request.url is not None
        cache_url = self.cache_url(request.url)
        logger.debug('Looking up "%s" in the cache', cache_url)
        cc = self.parse_cache_control(request.headers)

        # Bail out if the request insists on fresh data
        if "no-cache" in cc:
            logger.debug('Request header has "no-cache", cache bypassed')
            return False

        if "max-age" in cc and cc["max-age"] == 0:
            logger.debug('Request header has "max_age" as 0, cache bypassed')
            return False

        # Check whether we can load the response from the cache:
        resp = self._load_from_cache(request)
        if not resp:
            return False

        # If we have a cached permanent redirect, return it immediately. We
        # don't need to test our response for other headers b/c it is
        # intrinsically "cacheable" as it is Permanent.
        #
        # See:
        #   https://tools.ietf.org/html/rfc7231#section-6.4.2
        #
        # Client can try to refresh the value by repeating the request
        # with cache busting headers as usual (ie no-cache).
        if int(resp.status) in PERMANENT_REDIRECT_STATUSES:
            msg = (
                "Returning cached permanent redirect response "
                "(ignoring date and etag information)"
            )
            logger.debug(msg)
            return resp

        headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers)
        if not headers or "date" not in headers:
            if "etag" not in headers:
                # Without date or etag, the cached response can never be used
                # and should be deleted.
                logger.debug("Purging cached response: no date or etag")
                self.cache.delete(cache_url)
            logger.debug("Ignoring cached response: no date")
            return False

        now = time.time()
        time_tuple = parsedate_tz(headers["date"])
        assert time_tuple is not None
        date = calendar.timegm(time_tuple[:6])
        current_age = max(0, now - date)
        logger.debug("Current age based on date: %i", current_age)

        # TODO: There is an assumption that the result will be a
        #       urllib3 response object. This may not be best since we
        #       could probably avoid instantiating or constructing the
        #       response until we know we need it.
        resp_cc = self.parse_cache_control(headers)

        # determine freshness
        freshness_lifetime = 0

        # Check the max-age pragma in the cache control header
        max_age = resp_cc.get("max-age")
        if max_age is not None:
            freshness_lifetime = max_age
            logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime)

        # If there isn't a max-age, check for an expires header
        elif "expires" in headers:
            expires = parsedate_tz(headers["expires"])
            if expires is not None:
                expire_time = calendar.timegm(expires[:6]) - date
                freshness_lifetime = max(0, expire_time)
                logger.debug("Freshness lifetime from expires: %i", freshness_lifetime)

        # Determine if we are setting freshness limit in the
        # request. Note, this overrides what was in the response.
        max_age = cc.get("max-age")
        if max_age is not None:
            freshness_lifetime = max_age
            logger.debug(
                "Freshness lifetime from request max-age: %i", freshness_lifetime
            )

        min_fresh = cc.get("min-fresh")
        if min_fresh is not None:
            # adjust our current age by our min fresh
            current_age += min_fresh
            logger.debug("Adjusted current age from min-fresh: %i", current_age)

        # Return entry if it is fresh enough
        if freshness_lifetime > current_age:
            logger.debug('The response is "fresh", returning cached response')
            logger.debug("%i > %i", freshness_lifetime, current_age)
            return resp

        # we're not fresh. If we don't have an Etag, clear it out
        if "etag" not in headers:
            logger.debug('The cached response is "stale" with no etag, purging')
            self.cache.delete(cache_url)

        # return the original handler
        return False

    def conditional_headers(self, request: PreparedRequest) -> dict[str, str]:
        """Build If-None-Match / If-Modified-Since headers from a cached entry."""
        resp = self._load_from_cache(request)
        new_headers = {}

        if resp:
            headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers)

            if "etag" in headers:
                new_headers["If-None-Match"] = headers["ETag"]

            if "last-modified" in headers:
                new_headers["If-Modified-Since"] = headers["Last-Modified"]

        return new_headers

    def _cache_set(
        self,
        cache_url: str,
        request: PreparedRequest,
        response: HTTPResponse,
        body: bytes | None = None,
        expires_time: int | None = None,
    ) -> None:
        """
        Store the data in the cache.
        """
        if isinstance(self.cache, SeparateBodyBaseCache):
            # We pass in the body separately; just put a placeholder empty
            # string in the metadata.
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, b""),
                expires=expires_time,
            )
            # body is None can happen when, for example, we're only updating
            # headers, as is the case in update_cached_response().
            if body is not None:
                self.cache.set_body(cache_url, body)
        else:
            self.cache.set(
                cache_url,
                self.serializer.dumps(request, response, body),
                expires=expires_time,
            )

    def cache_response(
        self,
        request: PreparedRequest,
        response: HTTPResponse,
        body: bytes | None = None,
        status_codes: Collection[int] | None = None,
    ) -> None:
        """
        Algorithm for caching requests.

        This assumes a requests Response object.
        """
        # From httplib2: Don't cache 206's since we aren't going to
        #                handle byte range requests
        cacheable_status_codes = status_codes or self.cacheable_status_codes
        if response.status not in cacheable_status_codes:
            logger.debug(
                "Status code %s not in %s", response.status, cacheable_status_codes
            )
            return

        response_headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(
            response.headers
        )

        if "date" in response_headers:
            time_tuple = parsedate_tz(response_headers["date"])
            assert time_tuple is not None
            date = calendar.timegm(time_tuple[:6])
        else:
            date = 0

        # If we've been given a body, our response has a Content-Length, that
        # Content-Length is valid then we can check to see if the body we've
        # been given matches the expected size, and if it doesn't we'll just
        # skip trying to cache it.
        if (
            body is not None
            and "content-length" in response_headers
            and response_headers["content-length"].isdigit()
            and int(response_headers["content-length"]) != len(body)
        ):
            return

        cc_req = self.parse_cache_control(request.headers)
        cc = self.parse_cache_control(response_headers)

        assert request.url is not None
        cache_url = self.cache_url(request.url)
        logger.debug('Updating cache with response from "%s"', cache_url)

        # Delete it from the cache if we happen to have it stored there
        no_store = False
        if "no-store" in cc:
            no_store = True
            logger.debug('Response header has "no-store"')
        if "no-store" in cc_req:
            no_store = True
            logger.debug('Request header has "no-store"')
        if no_store and self.cache.get(cache_url):
            logger.debug('Purging existing cache entry to honor "no-store"')
            self.cache.delete(cache_url)
        if no_store:
            return

        # https://tools.ietf.org/html/rfc7234#section-4.1:
        # A Vary header field-value of "*" always fails to match.
        # Storing such a response leads to a deserialization warning
        # during cache lookup and is not allowed to ever be served,
        # so storing it can be avoided.
        if "*" in response_headers.get("vary", ""):
            logger.debug('Response header has "Vary: *"')
            return

        # If we've been given an etag, then keep the response
        if self.cache_etags and "etag" in response_headers:
            expires_time = 0
            if response_headers.get("expires"):
                expires = parsedate_tz(response_headers["expires"])
                if expires is not None:
                    expires_time = calendar.timegm(expires[:6]) - date

            # Keep etag entries around for at least 14 days so they can be
            # revalidated later.
            expires_time = max(expires_time, 14 * 86400)

            logger.debug(f"etag object cached for {expires_time} seconds")
            logger.debug("Caching due to etag")
            self._cache_set(cache_url, request, response, body, expires_time)

        # Add to the cache any permanent redirects. We do this before looking
        # that the Date headers.
        elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
            logger.debug("Caching permanent redirect")
            self._cache_set(cache_url, request, response, b"")

        # Add to the cache if the response headers demand it. If there
        # is no date header then we can't do anything about expiring
        # the cache.
        elif "date" in response_headers:
            time_tuple = parsedate_tz(response_headers["date"])
            assert time_tuple is not None
            date = calendar.timegm(time_tuple[:6])
            # cache when there is a max-age > 0
            max_age = cc.get("max-age")
            if max_age is not None and max_age > 0:
                logger.debug("Caching b/c date exists and max-age > 0")
                expires_time = max_age
                self._cache_set(
                    cache_url,
                    request,
                    response,
                    body,
                    expires_time,
                )

            # If the request can expire, it means we should cache it
            # in the meantime.
            elif "expires" in response_headers:
                if response_headers["expires"]:
                    expires = parsedate_tz(response_headers["expires"])
                    if expires is not None:
                        expires_time = calendar.timegm(expires[:6]) - date
                    else:
                        expires_time = None

                    logger.debug(
                        "Caching b/c of expires header. expires in {} seconds".format(
                            expires_time
                        )
                    )
                    self._cache_set(
                        cache_url,
                        request,
                        response,
                        body,
                        expires_time,
                    )

    def update_cached_response(
        self, request: PreparedRequest, response: HTTPResponse
    ) -> HTTPResponse:
        """On a 304 we will get a new set of headers that we want to
        update our cached value with, assuming we have one.

        This should only ever be called when we've sent an ETag and
        gotten a 304 as the response.
        """
        assert request.url is not None
        cache_url = self.cache_url(request.url)
        cached_response = self._load_from_cache(request)

        if not cached_response:
            # we didn't have a cached response
            return response

        # Lets update our headers with the headers from the new request:
        # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
        #
        # The server isn't supposed to send headers that would make
        # the cached body invalid. But... just in case, we'll be sure
        # to strip out ones we know that might be problematic due to
        # typical assumptions.
        excluded_headers = ["content-length"]

        cached_response.headers.update(
            {
                k: v
                for k, v in response.headers.items()
                if k.lower() not in excluded_headers
            }
        )

        # we want a 200 b/c we have content via the cache
        cached_response.status = 200

        # update our cache
        self._cache_set(cache_url, request, cached_response)

        return cached_response