diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bdf310fb4b80cb05a8d59296d53245b8ee0447b7
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/mapping.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69e010dcbcf1d511216bf00cba786b9c58136e43
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/__pycache__/registry.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd697f03eb2153c88e1522fb4c897a970d918d65
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/cached.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7dbb1c1ed8de54272f332c5ec7748bdcc406e4e
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/local.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e7e781e207661467dc1368beb005ab541c39ae50
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/reference.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f2c58c3aa9e15e6d80137c233bc565fc2508cb4
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/fsspec/implementations/__pycache__/zip.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h
new file mode 100644
index 0000000000000000000000000000000000000000..96eadad8a8e8c3979b99910ceea41ceaf2c8b58e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h
@@ -0,0 +1,891 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ * This is the public header file for the CUBLAS library, defining the API
+ *
+ * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
+ * on top of the CUDA runtime.
+ */
+
+#if !defined(CUBLAS_H_)
+#define CUBLAS_H_
+
+#if defined(CUBLAS_V2_H_)
+#error "It is an error to include both cublas.h and cublas_v2.h"
+#endif
+
+#include <cuda_runtime.h>
+
+#ifndef CUBLASWINAPI
+#ifdef _WIN32
+#define CUBLASWINAPI __stdcall
+#else
+#define CUBLASWINAPI
+#endif
+#endif
+
+#undef CUBLASAPI
+#ifdef __CUDACC__
+#define CUBLASAPI __host__
+#else
+#define CUBLASAPI
+#endif
+
+#include "cublas_api.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* CUBLAS data types */
+#define cublasStatus cublasStatus_t
+
+cublasStatus CUBLASWINAPI cublasInit(void);
+cublasStatus CUBLASWINAPI cublasShutdown(void);
+cublasStatus CUBLASWINAPI cublasGetError(void);
+
+cublasStatus CUBLASWINAPI cublasGetVersion(int* version);
+cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr);
+
+cublasStatus CUBLASWINAPI cublasFree(void* devicePtr);
+
+cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream);
+
+/* ---------------- CUBLAS BLAS1 functions ---------------- */
+/* NRM2 */
+float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx);
+double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx);
+float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx);
+double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* DOT */
+float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, int incy);
+double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy);
+cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
+cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
+cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
+cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
+/*------------------------------------------------------------------------*/
+/* SCAL */
+void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx);
+void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx);
+void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx);
+void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx);
+
+void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx);
+void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* AXPY */
+void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy);
+void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy);
+void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
+void CUBLASWINAPI
+cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
+/*------------------------------------------------------------------------*/
+/* COPY */
+void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy);
+void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy);
+void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, int incy);
+void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
+/*------------------------------------------------------------------------*/
+/* SWAP */
+void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy);
+void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy);
+void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy);
+void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
+/*------------------------------------------------------------------------*/
+/* AMAX */
+int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx);
+int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx);
+int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx);
+int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* AMIN */
+int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx);
+int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx);
+
+int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx);
+int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* ASUM */
+float CUBLASWINAPI cublasSasum(int n, const float* x, int incx);
+double CUBLASWINAPI cublasDasum(int n, const double* x, int incx);
+float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx);
+double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* ROT */
+void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss);
+void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss);
+void CUBLASWINAPI cublasCrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, cuComplex s);
+void CUBLASWINAPI
+cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs);
+void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s);
+void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s);
+/*------------------------------------------------------------------------*/
+/* ROTG */
+void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss);
+void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss);
+void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs);
+void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs);
+/*------------------------------------------------------------------------*/
+/* ROTM */
+void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam);
+void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam);
+/*------------------------------------------------------------------------*/
+/* ROTMG */
+void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam);
+void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam);
+
+/* --------------- CUBLAS BLAS2 functions  ---------------- */
+/* GEMV */
+void CUBLASWINAPI cublasSgemv(char trans,
+                              int m,
+                              int n,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              const float* x,
+                              int incx,
+                              float beta,
+                              float* y,
+                              int incy);
+void CUBLASWINAPI cublasDgemv(char trans,
+                              int m,
+                              int n,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* x,
+                              int incx,
+                              double beta,
+                              double* y,
+                              int incy);
+void CUBLASWINAPI cublasCgemv(char trans,
+                              int m,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* x,
+                              int incx,
+                              cuComplex beta,
+                              cuComplex* y,
+                              int incy);
+void CUBLASWINAPI cublasZgemv(char trans,
+                              int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* y,
+                              int incy);
+/*------------------------------------------------------------------------*/
+/* GBMV */
+void CUBLASWINAPI cublasSgbmv(char trans,
+                              int m,
+                              int n,
+                              int kl,
+                              int ku,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              const float* x,
+                              int incx,
+                              float beta,
+                              float* y,
+                              int incy);
+void CUBLASWINAPI cublasDgbmv(char trans,
+                              int m,
+                              int n,
+                              int kl,
+                              int ku,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* x,
+                              int incx,
+                              double beta,
+                              double* y,
+                              int incy);
+void CUBLASWINAPI cublasCgbmv(char trans,
+                              int m,
+                              int n,
+                              int kl,
+                              int ku,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* x,
+                              int incx,
+                              cuComplex beta,
+                              cuComplex* y,
+                              int incy);
+void CUBLASWINAPI cublasZgbmv(char trans,
+                              int m,
+                              int n,
+                              int kl,
+                              int ku,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* y,
+                              int incy);
+/*------------------------------------------------------------------------*/
+/* TRMV */
+void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
+void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
+void CUBLASWINAPI
+cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
+void CUBLASWINAPI
+cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* TBMV */
+void CUBLASWINAPI
+cublasStbmv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
+void CUBLASWINAPI
+cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
+void CUBLASWINAPI
+cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
+void CUBLASWINAPI cublasZtbmv(
+    char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* TPMV */
+void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
+
+void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
+
+void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
+
+void CUBLASWINAPI
+cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* TRSV */
+void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
+
+void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
+
+void CUBLASWINAPI
+cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
+
+void CUBLASWINAPI
+cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* TPSV */
+void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
+
+void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
+
+void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
+
+void CUBLASWINAPI
+cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* TBSV */
+void CUBLASWINAPI
+cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
+
+void CUBLASWINAPI
+cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
+void CUBLASWINAPI
+cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
+
+void CUBLASWINAPI cublasZtbsv(
+    char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* SYMV/HEMV */
+void CUBLASWINAPI cublasSsymv(
+    char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy);
+void CUBLASWINAPI cublasDsymv(char uplo,
+                              int n,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* x,
+                              int incx,
+                              double beta,
+                              double* y,
+                              int incy);
+void CUBLASWINAPI cublasChemv(char uplo,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* x,
+                              int incx,
+                              cuComplex beta,
+                              cuComplex* y,
+                              int incy);
+void CUBLASWINAPI cublasZhemv(char uplo,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* y,
+                              int incy);
+/*------------------------------------------------------------------------*/
+/* SBMV/HBMV */
+void CUBLASWINAPI cublasSsbmv(char uplo,
+                              int n,
+                              int k,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              const float* x,
+                              int incx,
+                              float beta,
+                              float* y,
+                              int incy);
+void CUBLASWINAPI cublasDsbmv(char uplo,
+                              int n,
+                              int k,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* x,
+                              int incx,
+                              double beta,
+                              double* y,
+                              int incy);
+void CUBLASWINAPI cublasChbmv(char uplo,
+                              int n,
+                              int k,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* x,
+                              int incx,
+                              cuComplex beta,
+                              cuComplex* y,
+                              int incy);
+void CUBLASWINAPI cublasZhbmv(char uplo,
+                              int n,
+                              int k,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* y,
+                              int incy);
+/*------------------------------------------------------------------------*/
+/* SPMV/HPMV */
+void CUBLASWINAPI
+cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy);
+void CUBLASWINAPI cublasDspmv(
+    char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy);
+void CUBLASWINAPI cublasChpmv(char uplo,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* AP,
+                              const cuComplex* x,
+                              int incx,
+                              cuComplex beta,
+                              cuComplex* y,
+                              int incy);
+void CUBLASWINAPI cublasZhpmv(char uplo,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* AP,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* y,
+                              int incy);
+
+/*------------------------------------------------------------------------*/
+/* GER */
+void CUBLASWINAPI
+cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
+void CUBLASWINAPI
+cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
+
+void CUBLASWINAPI cublasCgeru(
+    int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
+void CUBLASWINAPI cublasCgerc(
+    int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
+void CUBLASWINAPI cublasZgeru(int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              const cuDoubleComplex* y,
+                              int incy,
+                              cuDoubleComplex* A,
+                              int lda);
+void CUBLASWINAPI cublasZgerc(int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              const cuDoubleComplex* y,
+                              int incy,
+                              cuDoubleComplex* A,
+                              int lda);
+/*------------------------------------------------------------------------*/
+/* SYR/HER */
+void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda);
+void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda);
+
+void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda);
+void CUBLASWINAPI
+cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda);
+
+/*------------------------------------------------------------------------*/
+/* SPR/HPR */
+void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP);
+void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP);
+void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP);
+void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP);
+/*------------------------------------------------------------------------*/
+/* SYR2/HER2 */
+void CUBLASWINAPI
+cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
+void CUBLASWINAPI
+cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
+void CUBLASWINAPI cublasCher2(char uplo,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* x,
+                              int incx,
+                              const cuComplex* y,
+                              int incy,
+                              cuComplex* A,
+                              int lda);
+void CUBLASWINAPI cublasZher2(char uplo,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              const cuDoubleComplex* y,
+                              int incy,
+                              cuDoubleComplex* A,
+                              int lda);
+
+/*------------------------------------------------------------------------*/
+/* SPR2/HPR2 */
+void CUBLASWINAPI
+cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP);
+void CUBLASWINAPI
+cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP);
+void CUBLASWINAPI cublasChpr2(
+    char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP);
+void CUBLASWINAPI cublasZhpr2(char uplo,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              const cuDoubleComplex* y,
+                              int incy,
+                              cuDoubleComplex* AP);
+/* ------------------------BLAS3 Functions ------------------------------- */
+/* GEMM */
+void CUBLASWINAPI cublasSgemm(char transa,
+                              char transb,
+                              int m,
+                              int n,
+                              int k,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              const float* B,
+                              int ldb,
+                              float beta,
+                              float* C,
+                              int ldc);
+void CUBLASWINAPI cublasDgemm(char transa,
+                              char transb,
+                              int m,
+                              int n,
+                              int k,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* B,
+                              int ldb,
+                              double beta,
+                              double* C,
+                              int ldc);
+void CUBLASWINAPI cublasCgemm(char transa,
+                              char transb,
+                              int m,
+                              int n,
+                              int k,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* B,
+                              int ldb,
+                              cuComplex beta,
+                              cuComplex* C,
+                              int ldc);
+void CUBLASWINAPI cublasZgemm(char transa,
+                              char transb,
+                              int m,
+                              int n,
+                              int k,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* B,
+                              int ldb,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* C,
+                              int ldc);
+/* -------------------------------------------------------*/
+/* SYRK */
+void CUBLASWINAPI
+cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc);
+void CUBLASWINAPI cublasDsyrk(
+    char uplo, char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc);
+
+void CUBLASWINAPI cublasCsyrk(char uplo,
+                              char trans,
+                              int n,
+                              int k,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              cuComplex beta,
+                              cuComplex* C,
+                              int ldc);
+void CUBLASWINAPI cublasZsyrk(char uplo,
+                              char trans,
+                              int n,
+                              int k,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* C,
+                              int ldc);
+/* ------------------------------------------------------- */
+/* HERK */
+void CUBLASWINAPI cublasCherk(
+    char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc);
+void CUBLASWINAPI cublasZherk(char uplo,
+                              char trans,
+                              int n,
+                              int k,
+                              double alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              double beta,
+                              cuDoubleComplex* C,
+                              int ldc);
+/* ------------------------------------------------------- */
+/* SYR2K */
+void CUBLASWINAPI cublasSsyr2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               float alpha,
+                               const float* A,
+                               int lda,
+                               const float* B,
+                               int ldb,
+                               float beta,
+                               float* C,
+                               int ldc);
+
+void CUBLASWINAPI cublasDsyr2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               double alpha,
+                               const double* A,
+                               int lda,
+                               const double* B,
+                               int ldb,
+                               double beta,
+                               double* C,
+                               int ldc);
+void CUBLASWINAPI cublasCsyr2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               cuComplex alpha,
+                               const cuComplex* A,
+                               int lda,
+                               const cuComplex* B,
+                               int ldb,
+                               cuComplex beta,
+                               cuComplex* C,
+                               int ldc);
+
+void CUBLASWINAPI cublasZsyr2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               cuDoubleComplex alpha,
+                               const cuDoubleComplex* A,
+                               int lda,
+                               const cuDoubleComplex* B,
+                               int ldb,
+                               cuDoubleComplex beta,
+                               cuDoubleComplex* C,
+                               int ldc);
+/* ------------------------------------------------------- */
+/* HER2K */
+void CUBLASWINAPI cublasCher2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               cuComplex alpha,
+                               const cuComplex* A,
+                               int lda,
+                               const cuComplex* B,
+                               int ldb,
+                               float beta,
+                               cuComplex* C,
+                               int ldc);
+
+void CUBLASWINAPI cublasZher2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               cuDoubleComplex alpha,
+                               const cuDoubleComplex* A,
+                               int lda,
+                               const cuDoubleComplex* B,
+                               int ldb,
+                               double beta,
+                               cuDoubleComplex* C,
+                               int ldc);
+
+/*------------------------------------------------------------------------*/
+/* SYMM*/
+void CUBLASWINAPI cublasSsymm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              const float* B,
+                              int ldb,
+                              float beta,
+                              float* C,
+                              int ldc);
+void CUBLASWINAPI cublasDsymm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* B,
+                              int ldb,
+                              double beta,
+                              double* C,
+                              int ldc);
+
+void CUBLASWINAPI cublasCsymm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* B,
+                              int ldb,
+                              cuComplex beta,
+                              cuComplex* C,
+                              int ldc);
+
+void CUBLASWINAPI cublasZsymm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* B,
+                              int ldb,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* C,
+                              int ldc);
+/*------------------------------------------------------------------------*/
+/* HEMM*/
+void CUBLASWINAPI cublasChemm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* B,
+                              int ldb,
+                              cuComplex beta,
+                              cuComplex* C,
+                              int ldc);
+void CUBLASWINAPI cublasZhemm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* B,
+                              int ldb,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* C,
+                              int ldc);
+
+/*------------------------------------------------------------------------*/
+/* TRSM*/
+void CUBLASWINAPI cublasStrsm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              float* B,
+                              int ldb);
+
+void CUBLASWINAPI cublasDtrsm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              double* B,
+                              int ldb);
+
+void CUBLASWINAPI cublasCtrsm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              cuComplex* B,
+                              int ldb);
+
+void CUBLASWINAPI cublasZtrsm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              cuDoubleComplex* B,
+                              int ldb);
+/*------------------------------------------------------------------------*/
+/* TRMM*/
+void CUBLASWINAPI cublasStrmm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              float* B,
+                              int ldb);
+void CUBLASWINAPI cublasDtrmm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              double* B,
+                              int ldb);
+void CUBLASWINAPI cublasCtrmm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              cuComplex* B,
+                              int ldb);
+void CUBLASWINAPI cublasZtrmm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              cuDoubleComplex* B,
+                              int ldb);
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* !defined(CUBLAS_H_) */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe0e6f99b952514874c45208e751f5330e71570c
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h
@@ -0,0 +1,693 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*   cublasXt : Host API, Out of Core and Multi-GPU BLAS Library
+
+*/
+
+#if !defined(CUBLAS_XT_H_)
+#define CUBLAS_XT_H_
+
+#include "driver_types.h"
+#include "cuComplex.h" /* import complex data type */
+
+#include "cublas_v2.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+struct cublasXtContext;
+typedef struct cublasXtContext* cublasXtHandle_t;
+
+cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle);
+cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle);
+cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards);
+cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards);
+/* This routine selects the Gpus that the user want to use for CUBLAS-XT */
+cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]);
+
+/* This routine allows to change the dimension of the tiles ( blockDim x blockDim ) */
+cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim);
+cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim);
+
+typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
+/* This routine allows to CUBLAS-XT to pin the Host memory if it find out that some of the matrix passed
+   are not pinned : Pinning/Unpinning the Host memory is still a costly operation
+   It is better if the user controls the memory on its own (by pinning/unpinning oly when necessary)
+*/
+cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode);
+cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode);
+
+/* This routines is to provide a CPU Blas routines, used for too small sizes or hybrid computation */
+typedef enum {
+  CUBLASXT_FLOAT = 0,
+  CUBLASXT_DOUBLE = 1,
+  CUBLASXT_COMPLEX = 2,
+  CUBLASXT_DOUBLECOMPLEX = 3,
+} cublasXtOpType_t;
+
+typedef enum {
+  CUBLASXT_GEMM = 0,
+  CUBLASXT_SYRK = 1,
+  CUBLASXT_HERK = 2,
+  CUBLASXT_SYMM = 3,
+  CUBLASXT_HEMM = 4,
+  CUBLASXT_TRSM = 5,
+  CUBLASXT_SYR2K = 6,
+  CUBLASXT_HER2K = 7,
+
+  CUBLASXT_SPMM = 8,
+  CUBLASXT_SYRKX = 9,
+  CUBLASXT_HERKX = 10,
+  CUBLASXT_TRMM = 11,
+  CUBLASXT_ROUTINE_MAX = 12,
+} cublasXtBlasOp_t;
+
+/* Currently only 32-bit integer BLAS routines are supported */
+cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle,
+                                                  cublasXtBlasOp_t blasOp,
+                                                  cublasXtOpType_t type,
+                                                  void* blasFunctor);
+
+/* Specified the percentage of work that should done by the CPU, default is 0 (no work) */
+cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle,
+                                                cublasXtBlasOp_t blasOp,
+                                                cublasXtOpType_t type,
+                                                float ratio);
+
+/* GEMM */
+cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle,
+                                          cublasOperation_t transa,
+                                          cublasOperation_t transb,
+                                          size_t m,
+                                          size_t n,
+                                          size_t k,
+                                          const float* alpha,
+                                          const float* A,
+                                          size_t lda,
+                                          const float* B,
+                                          size_t ldb,
+                                          const float* beta,
+                                          float* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle,
+                                          cublasOperation_t transa,
+                                          cublasOperation_t transb,
+                                          size_t m,
+                                          size_t n,
+                                          size_t k,
+                                          const double* alpha,
+                                          const double* A,
+                                          size_t lda,
+                                          const double* B,
+                                          size_t ldb,
+                                          const double* beta,
+                                          double* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle,
+                                          cublasOperation_t transa,
+                                          cublasOperation_t transb,
+                                          size_t m,
+                                          size_t n,
+                                          size_t k,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const cuComplex* B,
+                                          size_t ldb,
+                                          const cuComplex* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle,
+                                          cublasOperation_t transa,
+                                          cublasOperation_t transb,
+                                          size_t m,
+                                          size_t n,
+                                          size_t k,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const cuDoubleComplex* B,
+                                          size_t ldb,
+                                          const cuDoubleComplex* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+/* ------------------------------------------------------- */
+/* SYRK */
+cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const float* alpha,
+                                          const float* A,
+                                          size_t lda,
+                                          const float* beta,
+                                          float* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const double* alpha,
+                                          const double* A,
+                                          size_t lda,
+                                          const double* beta,
+                                          double* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const cuComplex* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const cuDoubleComplex* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+/* -------------------------------------------------------------------- */
+/* HERK */
+cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const float* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const float* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const double* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const double* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+/* -------------------------------------------------------------------- */
+/* SYR2K */
+cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const float* alpha,
+                                           const float* A,
+                                           size_t lda,
+                                           const float* B,
+                                           size_t ldb,
+                                           const float* beta,
+                                           float* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const double* alpha,
+                                           const double* A,
+                                           size_t lda,
+                                           const double* B,
+                                           size_t ldb,
+                                           const double* beta,
+                                           double* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuComplex* alpha,
+                                           const cuComplex* A,
+                                           size_t lda,
+                                           const cuComplex* B,
+                                           size_t ldb,
+                                           const cuComplex* beta,
+                                           cuComplex* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuDoubleComplex* alpha,
+                                           const cuDoubleComplex* A,
+                                           size_t lda,
+                                           const cuDoubleComplex* B,
+                                           size_t ldb,
+                                           const cuDoubleComplex* beta,
+                                           cuDoubleComplex* C,
+                                           size_t ldc);
+/* -------------------------------------------------------------------- */
+/* HERKX : variant extension of HERK */
+cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuComplex* alpha,
+                                           const cuComplex* A,
+                                           size_t lda,
+                                           const cuComplex* B,
+                                           size_t ldb,
+                                           const float* beta,
+                                           cuComplex* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuDoubleComplex* alpha,
+                                           const cuDoubleComplex* A,
+                                           size_t lda,
+                                           const cuDoubleComplex* B,
+                                           size_t ldb,
+                                           const double* beta,
+                                           cuDoubleComplex* C,
+                                           size_t ldc);
+
+/* -------------------------------------------------------------------- */
+/* TRSM */
+cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const float* alpha,
+                                          const float* A,
+                                          size_t lda,
+                                          float* B,
+                                          size_t ldb);
+
+cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const double* alpha,
+                                          const double* A,
+                                          size_t lda,
+                                          double* B,
+                                          size_t ldb);
+
+cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          cuComplex* B,
+                                          size_t ldb);
+
+cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          cuDoubleComplex* B,
+                                          size_t ldb);
+/* -------------------------------------------------------------------- */
+/* SYMM : Symmetric Multiply Matrix*/
+cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const float* alpha,
+                                          const float* A,
+                                          size_t lda,
+                                          const float* B,
+                                          size_t ldb,
+                                          const float* beta,
+                                          float* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const double* alpha,
+                                          const double* A,
+                                          size_t lda,
+                                          const double* B,
+                                          size_t ldb,
+                                          const double* beta,
+                                          double* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const cuComplex* B,
+                                          size_t ldb,
+                                          const cuComplex* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const cuDoubleComplex* B,
+                                          size_t ldb,
+                                          const cuDoubleComplex* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+/* -------------------------------------------------------------------- */
+/* HEMM : Hermitian Matrix Multiply */
+cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const cuComplex* B,
+                                          size_t ldb,
+                                          const cuComplex* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const cuDoubleComplex* B,
+                                          size_t ldb,
+                                          const cuDoubleComplex* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+
+/* -------------------------------------------------------------------- */
+/* SYRKX : variant extension of SYRK  */
+cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const float* alpha,
+                                           const float* A,
+                                           size_t lda,
+                                           const float* B,
+                                           size_t ldb,
+                                           const float* beta,
+                                           float* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const double* alpha,
+                                           const double* A,
+                                           size_t lda,
+                                           const double* B,
+                                           size_t ldb,
+                                           const double* beta,
+                                           double* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuComplex* alpha,
+                                           const cuComplex* A,
+                                           size_t lda,
+                                           const cuComplex* B,
+                                           size_t ldb,
+                                           const cuComplex* beta,
+                                           cuComplex* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuDoubleComplex* alpha,
+                                           const cuDoubleComplex* A,
+                                           size_t lda,
+                                           const cuDoubleComplex* B,
+                                           size_t ldb,
+                                           const cuDoubleComplex* beta,
+                                           cuDoubleComplex* C,
+                                           size_t ldc);
+/* -------------------------------------------------------------------- */
+/* HER2K : variant extension of HERK  */
+cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuComplex* alpha,
+                                           const cuComplex* A,
+                                           size_t lda,
+                                           const cuComplex* B,
+                                           size_t ldb,
+                                           const float* beta,
+                                           cuComplex* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuDoubleComplex* alpha,
+                                           const cuDoubleComplex* A,
+                                           size_t lda,
+                                           const cuDoubleComplex* B,
+                                           size_t ldb,
+                                           const double* beta,
+                                           cuDoubleComplex* C,
+                                           size_t ldc);
+
+/* -------------------------------------------------------------------- */
+/* SPMM : Symmetric Packed Multiply Matrix*/
+cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const float* alpha,
+                                          const float* AP,
+                                          const float* B,
+                                          size_t ldb,
+                                          const float* beta,
+                                          float* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const double* alpha,
+                                          const double* AP,
+                                          const double* B,
+                                          size_t ldb,
+                                          const double* beta,
+                                          double* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuComplex* alpha,
+                                          const cuComplex* AP,
+                                          const cuComplex* B,
+                                          size_t ldb,
+                                          const cuComplex* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* AP,
+                                          const cuDoubleComplex* B,
+                                          size_t ldb,
+                                          const cuDoubleComplex* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+
+/* -------------------------------------------------------------------- */
+/* TRMM */
+cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const float* alpha,
+                                          const float* A,
+                                          size_t lda,
+                                          const float* B,
+                                          size_t ldb,
+                                          float* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const double* alpha,
+                                          const double* A,
+                                          size_t lda,
+                                          const double* B,
+                                          size_t ldb,
+                                          double* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const cuComplex* B,
+                                          size_t ldb,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZtrmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const cuDoubleComplex* B,
+                                          size_t ldb,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* !defined(CUBLAS_XT_H_) */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h
new file mode 100644
index 0000000000000000000000000000000000000000..29ea9153faf7b3e62a6d53c0be1980ae79c49f51
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h
@@ -0,0 +1,824 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(NVBLAS_H_)
+#define NVBLAS_H_
+
+#include "driver_types.h"
+#include "cuComplex.h" /* import complex data type */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* GEMM */
+void sgemm_(const char* transa,
+            const char* transb,
+            const int* m,
+            const int* n,
+            const int* k,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            const float* b,
+            const int* ldb,
+            const float* beta,
+            float* c,
+            const int* ldc);
+
+void dgemm_(const char* transa,
+            const char* transb,
+            const int* m,
+            const int* n,
+            const int* k,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            const double* b,
+            const int* ldb,
+            const double* beta,
+            double* c,
+            const int* ldc);
+
+void cgemm_(const char* transa,
+            const char* transb,
+            const int* m,
+            const int* n,
+            const int* k,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* b,
+            const int* ldb,
+            const cuComplex* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zgemm_(const char* transa,
+            const char* transb,
+            const int* m,
+            const int* n,
+            const int* k,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* b,
+            const int* ldb,
+            const cuDoubleComplex* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+void sgemm(const char* transa,
+           const char* transb,
+           const int* m,
+           const int* n,
+           const int* k,
+           const float* alpha,
+           const float* a,
+           const int* lda,
+           const float* b,
+           const int* ldb,
+           const float* beta,
+           float* c,
+           const int* ldc);
+
+void dgemm(const char* transa,
+           const char* transb,
+           const int* m,
+           const int* n,
+           const int* k,
+           const double* alpha,
+           const double* a,
+           const int* lda,
+           const double* b,
+           const int* ldb,
+           const double* beta,
+           double* c,
+           const int* ldc);
+
+void cgemm(const char* transa,
+           const char* transb,
+           const int* m,
+           const int* n,
+           const int* k,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           const cuComplex* b,
+           const int* ldb,
+           const cuComplex* beta,
+           cuComplex* c,
+           const int* ldc);
+
+void zgemm(const char* transa,
+           const char* transb,
+           const int* m,
+           const int* n,
+           const int* k,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           const cuDoubleComplex* b,
+           const int* ldb,
+           const cuDoubleComplex* beta,
+           cuDoubleComplex* c,
+           const int* ldc);
+
+/* SYRK */
+void ssyrk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            const float* beta,
+            float* c,
+            const int* ldc);
+
+void dsyrk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            const double* beta,
+            double* c,
+            const int* ldc);
+
+void csyrk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zsyrk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+void ssyrk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const float* alpha,
+           const float* a,
+           const int* lda,
+           const float* beta,
+           float* c,
+           const int* ldc);
+
+void dsyrk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const double* alpha,
+           const double* a,
+           const int* lda,
+           const double* beta,
+           double* c,
+           const int* ldc);
+
+void csyrk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           const cuComplex* beta,
+           cuComplex* c,
+           const int* ldc);
+
+void zsyrk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           const cuDoubleComplex* beta,
+           cuDoubleComplex* c,
+           const int* ldc);
+
+/* HERK */
+void cherk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const float* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const float* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zherk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const double* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const double* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+void cherk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const float* alpha,
+           const cuComplex* a,
+           const int* lda,
+           const float* beta,
+           cuComplex* c,
+           const int* ldc);
+
+void zherk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const double* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           const double* beta,
+           cuDoubleComplex* c,
+           const int* ldc);
+
+/* TRSM */
+void strsm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            float* b,
+            const int* ldb);
+
+void dtrsm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            double* b,
+            const int* ldb);
+
+void ctrsm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            cuComplex* b,
+            const int* ldb);
+
+void ztrsm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            cuDoubleComplex* b,
+            const int* ldb);
+
+void strsm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const float* alpha,
+           const float* a,
+           const int* lda,
+           float* b,
+           const int* ldb);
+
+void dtrsm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const double* alpha,
+           const double* a,
+           const int* lda,
+           double* b,
+           const int* ldb);
+
+void ctrsm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           cuComplex* b,
+           const int* ldb);
+
+void ztrsm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           cuDoubleComplex* b,
+           const int* ldb);
+
+/* SYMM */
+void ssymm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            const float* b,
+            const int* ldb,
+            const float* beta,
+            float* c,
+            const int* ldc);
+
+void dsymm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            const double* b,
+            const int* ldb,
+            const double* beta,
+            double* c,
+            const int* ldc);
+
+void csymm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* b,
+            const int* ldb,
+            const cuComplex* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zsymm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* b,
+            const int* ldb,
+            const cuDoubleComplex* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+void ssymm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const float* alpha,
+           const float* a,
+           const int* lda,
+           const float* b,
+           const int* ldb,
+           const float* beta,
+           float* c,
+           const int* ldc);
+
+void dsymm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const double* alpha,
+           const double* a,
+           const int* lda,
+           const double* b,
+           const int* ldb,
+           const double* beta,
+           double* c,
+           const int* ldc);
+
+void csymm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           const cuComplex* b,
+           const int* ldb,
+           const cuComplex* beta,
+           cuComplex* c,
+           const int* ldc);
+
+void zsymm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           const cuDoubleComplex* b,
+           const int* ldb,
+           const cuDoubleComplex* beta,
+           cuDoubleComplex* c,
+           const int* ldc);
+
+/* HEMM */
+void chemm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* b,
+            const int* ldb,
+            const cuComplex* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zhemm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* b,
+            const int* ldb,
+            const cuDoubleComplex* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+/* HEMM with no underscore*/
+void chemm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           const cuComplex* b,
+           const int* ldb,
+           const cuComplex* beta,
+           cuComplex* c,
+           const int* ldc);
+
+void zhemm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           const cuDoubleComplex* b,
+           const int* ldb,
+           const cuDoubleComplex* beta,
+           cuDoubleComplex* c,
+           const int* ldc);
+
+/* SYR2K */
+void ssyr2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const float* alpha,
+             const float* a,
+             const int* lda,
+             const float* b,
+             const int* ldb,
+             const float* beta,
+             float* c,
+             const int* ldc);
+
+void dsyr2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const double* alpha,
+             const double* a,
+             const int* lda,
+             const double* b,
+             const int* ldb,
+             const double* beta,
+             double* c,
+             const int* ldc);
+
+void csyr2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const cuComplex* alpha,
+             const cuComplex* a,
+             const int* lda,
+             const cuComplex* b,
+             const int* ldb,
+             const cuComplex* beta,
+             cuComplex* c,
+             const int* ldc);
+
+void zsyr2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const cuDoubleComplex* alpha,
+             const cuDoubleComplex* a,
+             const int* lda,
+             const cuDoubleComplex* b,
+             const int* ldb,
+             const cuDoubleComplex* beta,
+             cuDoubleComplex* c,
+             const int* ldc);
+
+/* SYR2K no_underscore*/
+void ssyr2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            const float* b,
+            const int* ldb,
+            const float* beta,
+            float* c,
+            const int* ldc);
+
+void dsyr2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            const double* b,
+            const int* ldb,
+            const double* beta,
+            double* c,
+            const int* ldc);
+
+void csyr2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* b,
+            const int* ldb,
+            const cuComplex* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zsyr2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* b,
+            const int* ldb,
+            const cuDoubleComplex* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+/* HERK */
+void cher2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const cuComplex* alpha,
+             const cuComplex* a,
+             const int* lda,
+             const cuComplex* b,
+             const int* ldb,
+             const float* beta,
+             cuComplex* c,
+             const int* ldc);
+
+void zher2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const cuDoubleComplex* alpha,
+             const cuDoubleComplex* a,
+             const int* lda,
+             const cuDoubleComplex* b,
+             const int* ldb,
+             const double* beta,
+             cuDoubleComplex* c,
+             const int* ldc);
+
+/* HER2K with no underscore */
+void cher2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* b,
+            const int* ldb,
+            const float* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zher2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* b,
+            const int* ldb,
+            const double* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+/* TRMM */
+void strmm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            float* b,
+            const int* ldb);
+
+void dtrmm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            double* b,
+            const int* ldb);
+
+void ctrmm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            cuComplex* b,
+            const int* ldb);
+
+void ztrmm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            cuDoubleComplex* b,
+            const int* ldb);
+
+void strmm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const float* alpha,
+           const float* a,
+           const int* lda,
+           float* b,
+           const int* ldb);
+
+void dtrmm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const double* alpha,
+           const double* a,
+           const int* lda,
+           double* b,
+           const int* ldb);
+
+void ctrmm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           cuComplex* b,
+           const int* ldb);
+
+void ztrmm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           cuDoubleComplex* b,
+           const int* ldb);
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* !defined(NVBLAS_H_) */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7ed27cc5337cb2e6617930b9fda168ba6d6f133
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c41f007fec4575259e842c5049a2e1f82bceca91
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0666acbb2a280d9956f39295f378c6648d3fb90
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbde94d36caaf494432a29e13aa91ca3dc7d5b51
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h
@@ -0,0 +1,758 @@
+//
+// NVIDIA_COPYRIGHT_BEGIN
+//
+// Copyright (c) 2014-2022, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+//
+// NVIDIA_COPYRIGHT_END
+//
+
+#ifndef __NVRTC_H__
+#define __NVRTC_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include <stdlib.h>
+
+
+/*************************************************************************//**
+ *
+ * \defgroup error Error Handling
+ *
+ * NVRTC defines the following enumeration type and function for API call
+ * error handling.
+ *
+ ****************************************************************************/
+
+
+/**
+ * \ingroup error
+ * \brief   The enumerated type nvrtcResult defines API call result codes.
+ *          NVRTC API functions return nvrtcResult to indicate the call
+ *          result.
+ */
+typedef enum {
+  NVRTC_SUCCESS = 0,
+  NVRTC_ERROR_OUT_OF_MEMORY = 1,
+  NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+  NVRTC_ERROR_INVALID_INPUT = 3,
+  NVRTC_ERROR_INVALID_PROGRAM = 4,
+  NVRTC_ERROR_INVALID_OPTION = 5,
+  NVRTC_ERROR_COMPILATION = 6,
+  NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+  NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+  NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+  NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+  NVRTC_ERROR_INTERNAL_ERROR = 11
+} nvrtcResult;
+
+
+/**
+ * \ingroup error
+ * \brief   nvrtcGetErrorString is a helper function that returns a string
+ *          describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
+ *          \c "NVRTC_SUCCESS".
+ *          For unrecognized enumeration values, it returns
+ *          \c "NVRTC_ERROR unknown".
+ *
+ * \param   [in] result CUDA Runtime Compilation API result code.
+ * \return  Message string for the given #nvrtcResult code.
+ */
+const char *nvrtcGetErrorString(nvrtcResult result);
+
+
+/*************************************************************************//**
+ *
+ * \defgroup query General Information Query
+ *
+ * NVRTC defines the following function for general information query.
+ *
+ ****************************************************************************/
+
+
+/**
+ * \ingroup query
+ * \brief   nvrtcVersion sets the output parameters \p major and \p minor
+ *          with the CUDA Runtime Compilation version number.
+ *
+ * \param   [out] major CUDA Runtime Compilation major version number.
+ * \param   [out] minor CUDA Runtime Compilation minor version number.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *
+ */
+nvrtcResult nvrtcVersion(int *major, int *minor);
+
+
+/**
+ * \ingroup query
+ * \brief   nvrtcGetNumSupportedArchs sets the output parameter \p numArchs 
+ *          with the number of architectures supported by NVRTC. This can 
+ *          then be used to pass an array to ::nvrtcGetSupportedArchs to
+ *          get the supported architectures.
+ *
+ * \param   [out] numArchs number of supported architectures.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *
+ * see    ::nvrtcGetSupportedArchs
+ */
+nvrtcResult nvrtcGetNumSupportedArchs(int* numArchs);
+
+
+/**
+ * \ingroup query
+ * \brief   nvrtcGetSupportedArchs populates the array passed via the output parameter 
+ *          \p supportedArchs with the architectures supported by NVRTC. The array is
+ *          sorted in the ascending order. The size of the array to be passed can be
+ *          determined using ::nvrtcGetNumSupportedArchs.
+ *
+ * \param   [out] supportedArchs sorted array of supported architectures.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *
+ * see    ::nvrtcGetNumSupportedArchs
+ */
+nvrtcResult nvrtcGetSupportedArchs(int* supportedArchs);
+
+
+/*************************************************************************//**
+ *
+ * \defgroup compilation Compilation
+ *
+ * NVRTC defines the following type and functions for actual compilation.
+ *
+ ****************************************************************************/
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcProgram is the unit of compilation, and an opaque handle for
+ *          a program.
+ *
+ * To compile a CUDA program string, an instance of nvrtcProgram must be
+ * created first with ::nvrtcCreateProgram, then compiled with
+ * ::nvrtcCompileProgram.
+ */
+typedef struct _nvrtcProgram *nvrtcProgram;
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcCreateProgram creates an instance of nvrtcProgram with the
+ *          given input parameters, and sets the output parameter \p prog with
+ *          it.
+ *
+ * \param   [out] prog         CUDA Runtime Compilation program.
+ * \param   [in]  src          CUDA program source.
+ * \param   [in]  name         CUDA program name.\n
+ *                             \p name can be \c NULL; \c "default_program" is
+ *                             used when \p name is \c NULL or "".
+ * \param   [in]  numHeaders   Number of headers used.\n
+ *                             \p numHeaders must be greater than or equal to 0.
+ * \param   [in]  headers      Sources of the headers.\n
+ *                             \p headers can be \c NULL when \p numHeaders is
+ *                             0.
+ * \param   [in]  includeNames Name of each header by which they can be
+ *                             included in the CUDA program source.\n
+ *                             \p includeNames can be \c NULL when \p numHeaders
+ *                             is 0.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcDestroyProgram
+ */
+nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
+                               const char *src,
+                               const char *name,
+                               int numHeaders,
+                               const char * const *headers,
+                               const char * const *includeNames);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcDestroyProgram destroys the given program.
+ *
+ * \param    [in] prog CUDA Runtime Compilation program.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcCreateProgram
+ */
+nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcCompileProgram compiles the given program.
+ *
+ * \param   [in] prog       CUDA Runtime Compilation program.
+ * \param   [in] numOptions Number of compiler options passed.
+ * \param   [in] options    Compiler options in the form of C string array.\n
+ *                          \p options can be \c NULL when \p numOptions is 0.
+ *
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
+ *
+ * It supports compile options listed in \ref options.
+ */
+nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
+                                int numOptions, const char * const *options);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetPTXSize sets \p ptxSizeRet with the size of the PTX
+ *          generated by the previous compilation of \p prog (including the
+ *          trailing \c NULL).
+ *
+ * \param   [in]  prog       CUDA Runtime Compilation program.
+ * \param   [out] ptxSizeRet Size of the generated PTX (including the trailing
+ *                           \c NULL).
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetPTX
+ */
+nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetPTX stores the PTX generated by the previous compilation
+ *          of \p prog in the memory pointed by \p ptx.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [out] ptx  Compiled result.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetPTXSize
+ */
+nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetCUBINSize sets \p cubinSizeRet with the size of the cubin
+ *          generated by the previous compilation of \p prog. The value of
+ *          cubinSizeRet is set to 0 if the value specified to \c -arch is a
+ *          virtual architecture instead of an actual architecture.
+ *
+ * \param   [in]  prog       CUDA Runtime Compilation program.
+ * \param   [out] cubinSizeRet Size of the generated cubin.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetCUBIN
+ */
+nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetCUBIN stores the cubin generated by the previous compilation
+ *          of \p prog in the memory pointed by \p cubin. No cubin is available
+ *          if the value specified to \c -arch is a virtual architecture instead
+ *          of an actual architecture.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [out] cubin  Compiled and assembled result.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetCUBINSize
+ */
+nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetNVVMSize sets \p nvvmSizeRet with the size of the NVVM
+ *          generated by the previous compilation of \p prog. The value of
+ *          nvvmSizeRet is set to 0 if the program was not compiled with 
+ *          \c -dlto.
+ *
+ * \param   [in]  prog       CUDA Runtime Compilation program.
+ * \param   [out] nvvmSizeRet Size of the generated NVVM.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetNVVM
+ */
+nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetNVVM stores the NVVM generated by the previous compilation
+ *          of \p prog in the memory pointed by \p nvvm.
+ *          The program must have been compiled with -dlto,
+ *          otherwise will return an error.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [out] nvvm Compiled result.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetNVVMSize
+ */
+nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm);
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
+ *          log generated by the previous compilation of \p prog (including the
+ *          trailing \c NULL).
+ *
+ * Note that compilation log may be generated with warnings and informative
+ * messages, even when the compilation of \p prog succeeds.
+ *
+ * \param   [in]  prog       CUDA Runtime Compilation program.
+ * \param   [out] logSizeRet Size of the compilation log
+ *                           (including the trailing \c NULL).
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetProgramLog
+ */
+nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetProgramLog stores the log generated by the previous
+ *          compilation of \p prog in the memory pointed by \p log.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [out] log  Compilation log.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetProgramLogSize
+ */
+nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcAddNameExpression notes the given name expression
+ *          denoting the address of a __global__ function 
+ *          or __device__/__constant__ variable.
+ *
+ * The identical name expression string must be provided on a subsequent
+ * call to nvrtcGetLoweredName to extract the lowered name.
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [in] name_expression constant expression denoting the address of
+ *               a __global__ function or __device__/__constant__ variable.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
+ *
+ * \see     ::nvrtcGetLoweredName
+ */
+nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
+                                   const char * const name_expression);
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetLoweredName extracts the lowered (mangled) name
+ *          for a __global__ function or __device__/__constant__ variable,
+ *          and updates *lowered_name to point to it. The memory containing
+ *          the name is released when the NVRTC program is destroyed by 
+ *          nvrtcDestroyProgram.
+ *          The identical name expression must have been previously
+ *          provided to nvrtcAddNameExpression.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [in] name_expression constant expression denoting the address of 
+ *               a __global__ function or __device__/__constant__ variable.
+ * \param   [out] lowered_name initialized by the function to point to a
+ *               C string containing the lowered (mangled)
+ *               name corresponding to the provided name expression.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
+ *
+ * \see     ::nvrtcAddNameExpression
+ */
+nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
+                                const char *const name_expression,
+                                const char** lowered_name);
+
+
+/**
+ * \defgroup options Supported Compile Options
+ *
+ * NVRTC supports the compile options below.
+ * Option names with two preceding dashs (\c --) are long option names and
+ * option names with one preceding dash (\c -) are short option names.
+ * Short option names can be used instead of long option names.
+ * When a compile option takes an argument, an assignment operator (\c =)
+ * is used to separate the compile option argument from the compile option
+ * name, e.g., \c "--gpu-architecture=compute_60".
+ * Alternatively, the compile option name and the argument can be specified in
+ * separate strings without an assignment operator, .e.g,
+ * \c "--gpu-architecture" \c "compute_60".
+ * Single-character short option names, such as \c -D, \c -U, and \c -I, do
+ * not require an assignment operator, and the compile option name and the
+ * argument can be present in the same string with or without spaces between
+ * them.
+ * For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
+ * supported.
+ *
+ * The valid compiler options are:
+ *
+ *   - Compilation targets
+ *     - \c --gpu-architecture=\<arch\> (\c -arch)\n
+ *       Specify the name of the class of GPU architectures for which the
+ *       input must be compiled.\n
+ *       - Valid <c>\<arch\></c>s:
+ *         - \c compute_35
+ *         - \c compute_37
+ *         - \c compute_50
+ *         - \c compute_52
+ *         - \c compute_53
+ *         - \c compute_60
+ *         - \c compute_61
+ *         - \c compute_62
+ *         - \c compute_70
+ *         - \c compute_72
+ *         - \c compute_75
+ *         - \c compute_80
+ *         - \c compute_87
+ *         - \c compute_89
+ *         - \c compute_90
+ *         - \c sm_35
+ *         - \c sm_37
+ *         - \c sm_50
+ *         - \c sm_52
+ *         - \c sm_53
+ *         - \c sm_60
+ *         - \c sm_61
+ *         - \c sm_62
+ *         - \c sm_70
+ *         - \c sm_72
+ *         - \c sm_75
+ *         - \c sm_80
+ *         - \c sm_87
+ *         - \c sm_89
+ *         - \c sm_90
+ *       - Default: \c compute_52
+ *   - Separate compilation / whole-program compilation
+ *     - \c --device-c (\c -dc)\n
+ *       Generate relocatable code that can be linked with other relocatable
+ *       device code.  It is equivalent to --relocatable-device-code=true.
+ *     - \c --device-w (\c -dw)\n
+ *       Generate non-relocatable code.  It is equivalent to
+ *       \c --relocatable-device-code=false.
+ *     - \c --relocatable-device-code={true|false} (\c -rdc)\n
+ *       Enable (disable) the generation of relocatable device code.
+ *       - Default: \c false
+ *     - \c --extensible-whole-program (\c -ewp)\n
+ *       Do extensible whole program compilation of device code.
+ *       - Default: \c false
+ *   - Debugging support
+ *     - \c --device-debug (\c -G)\n
+ *       Generate debug information. If --dopt is not specified, 
+ *       then turns off all optimizations.
+ *     - \c --generate-line-info (\c -lineinfo)\n
+ *       Generate line-number information.
+ *   - Code generation
+ *     - \c --dopt on (\c -dopt)\n
+ *     - \c --dopt=on \n
+ *       Enable device code optimization. When specified along with '-G', enables
+ *       limited debug information generation for optimized device code (currently,
+ *       only line number information).
+ *       When '-G' is not specified, '-dopt=on' is implicit.
+ *     - \c --ptxas-options \<options\> (\c -Xptxas)\n
+ *     - \c --ptxas-options=\<options\> \n
+ *       Specify options directly to ptxas, the PTX optimizing assembler.
+ *     - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
+ *       Specify the maximum amount of registers that GPU functions can use.
+ *       Until a function-specific limit, a higher value will generally
+ *       increase the performance of individual GPU threads that execute this
+ *       function.  However, because thread registers are allocated from a
+ *       global register pool on each GPU, a higher value of this option will
+ *       also reduce the maximum thread block size, thereby reducing the amount
+ *       of thread parallelism.  Hence, a good maxrregcount value is the result
+ *       of a trade-off.  If this option is not specified, then no maximum is
+ *       assumed.  Value less than the minimum registers required by ABI will
+ *       be bumped up by the compiler to ABI minimum limit.
+ *     - \c --ftz={true|false} (\c -ftz)\n
+ *       When performing single-precision floating-point operations, flush
+ *       denormal values to zero or preserve denormal values.
+ *       \c --use_fast_math implies \c --ftz=true.
+ *       - Default: \c false
+ *     - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
+ *       For single-precision floating-point square root, use IEEE
+ *       round-to-nearest mode or use a faster approximation.
+ *       \c --use_fast_math implies \c --prec-sqrt=false.
+ *       - Default: \c true
+ *     - \c --prec-div={true|false} (\c -prec-div)\n
+ *       For single-precision floating-point division and reciprocals, use IEEE
+ *       round-to-nearest mode or use a faster approximation.
+ *       \c --use_fast_math implies \c --prec-div=false.
+ *       - Default: \c true
+ *     - \c --fmad={true|false} (\c -fmad)\n
+ *       Enables (disables) the contraction of floating-point multiplies and
+ *       adds/subtracts into floating-point multiply-add operations (FMAD,
+ *       FFMA, or DFMA).  \c --use_fast_math implies \c --fmad=true.
+ *       - Default: \c true
+ *     - \c --use_fast_math (\c -use_fast_math)\n
+ *       Make use of fast math operations.
+ *       \c --use_fast_math implies \c --ftz=true \c --prec-div=false
+ *       \c --prec-sqrt=false \c --fmad=true.
+ *     - \c --extra-device-vectorization (\c -extra-device-vectorization)\n
+ *       Enables more aggressive device code vectorization in the NVVM optimizer.
+ *     - \c --modify-stack-limit={true|false} (\c -modify-stack-limit)\n
+ *       On Linux, during compilation, use \c setrlimit() to increase stack size 
+ *       to maximum allowed. The limit is reset to the previous value at the
+ *       end of compilation.
+ *       Note: \c setrlimit() changes the value for the entire process.
+ *       - Default: \c true
+ *     - \c --dlink-time-opt (\c -dlto)\n
+ *       Generate intermediate code for later link-time optimization.
+ *       It implies \c -rdc=true.  
+ *       Note: when this is used the nvrtcGetNVVM API should be used, 
+ *       as PTX or Cubin will not be generated.
+ *   - Preprocessing
+ *     - \c --define-macro=\<def\> (\c -D)\n
+ *       \c \<def\> can be either \c \<name\> or \c \<name=definitions\>.
+ *       - \c \<name\> \n
+ *         Predefine \c \<name\> as a macro with definition \c 1.
+ *       - \c \<name\>=\<definition\> \n
+ *         The contents of \c \<definition\> are tokenized and preprocessed
+ *         as if they appeared during translation phase three in a \c \#define
+ *         directive.  In particular, the definition will be truncated by
+ *         embedded new line characters.
+ *     - \c --undefine-macro=\<def\> (\c -U)\n
+ *       Cancel any previous definition of \c \<def\>.
+ *     - \c --include-path=\<dir\> (\c -I)\n
+ *       Add the directory \c \<dir\> to the list of directories to be
+ *       searched for headers.  These paths are searched after the list of
+ *       headers given to ::nvrtcCreateProgram.
+ *     - \c --pre-include=\<header\> (\c -include)\n
+ *       Preinclude \c \<header\> during preprocessing.
+ *     - \c --no-source-include (\c -no-source-include)
+ *       The preprocessor by default adds the directory of each input sources
+ *       to the include path. This option disables this feature and only
+ *       considers the path specified explicitly.
+ *   - Language Dialect
+ *     - \c --std={c++03|c++11|c++14|c++17}
+ *       (\c -std={c++11|c++14|c++17})\n
+ *       Set language dialect to C++03, C++11, C++14 or C++17
+ *     - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
+ *       Provide builtin definitions of \c std::move and \c std::forward,
+ *       when C++11 language dialect is selected.
+ *       - Default: \c true
+ *     - \c --builtin-initializer-list={true|false}
+ *       (\c -builtin-initializer-list)\n
+ *       Provide builtin definitions of \c std::initializer_list class and
+ *       member functions when C++11 language dialect is selected.
+ *       - Default: \c true
+ *   - Misc.
+ *     - \c --disable-warnings (\c -w)\n
+ *       Inhibit all warning messages.
+ *     - \c --restrict (\c -restrict)\n
+ *       Programmer assertion that all kernel pointer parameters are restrict
+ *       pointers.
+ *     - \c --device-as-default-execution-space
+ *       (\c -default-device)\n
+ *       Treat entities with no execution space annotation as \c __device__
+ *       entities.
+ *     - \c --device-int128 (\c -device-int128)\n
+ *       Allow the \c __int128 type in device code. Also causes the macro \c __CUDACC_RTC_INT128__
+ *       to be defined.
+ *     - \c --optimization-info=\<kind\> (\c -opt-info)\n
+ *       Provide optimization reports for the specified kind of optimization.
+ *       The following kind tags are supported:
+ *         - \c inline : emit a remark when a function is inlined.
+ *     - \c --version-ident={true|false} (\c -dQ)\n
+ *       Embed used compiler's version info into generated PTX/CUBIN 
+ *       - Default: \c false
+ *     - \c --display-error-number (\c -err-no)\n
+ *       Display diagnostic number for warning messages. (Default)
+ *     - \c --no-display-error-number (\c -no-err-no)\n
+ *       Disables the display of a diagnostic number for warning messages.
+ *     - \c --diag-error=<error-number>,... (\c -diag-error)\n
+ *       Emit error for specified diagnostic message number(s). Message numbers can be separated by comma.
+ *     - \c --diag-suppress=<error-number>,... (\c -diag-suppress)\n
+ *       Suppress specified diagnostic message number(s). Message numbers can be separated by comma.
+ *     - \c --diag-warn=<error-number>,... (\c -diag-warn)\n
+ *       Emit warning for specified diagnostic message number(s). Message numbers can be separated by comma.
+ *
+ */
+
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+
+/* The utility function 'nvrtcGetTypeName' is not available by default. Define
+   the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
+*/
+   
+#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
+
+#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
+#include <cxxabi.h>
+#include <cstdlib>
+
+#elif defined(_WIN32)
+#include <Windows.h>
+#include <DbgHelp.h>
+#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
+
+
+#include <string>
+#include <typeinfo>
+
+template <typename T> struct __nvrtcGetTypeName_helper_t { };
+
+/*************************************************************************//**
+ *
+ * \defgroup hosthelper Host Helper
+ *
+ * NVRTC defines the following functions for easier interaction with host code.
+ *
+ ****************************************************************************/
+
+/**
+ * \ingroup hosthelper
+ * \brief   nvrtcGetTypeName stores the source level name of a type in the given 
+ *          std::string location. 
+ *
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
+ * respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
+ * otherwise *result is initialized with the extracted name.
+ * 
+ * Windows-specific notes:
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(), 
+ *   which is not multi-thread safe.
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
+ *
+ * \param   [in] tinfo: reference to object of type std::type_info for a given type.
+ * \param   [in] result: pointer to std::string in which to store the type name.
+ * \return
+ *  - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *  - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
+ *
+ */
+inline nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result)
+{
+#if USE_CXXABI || __clang__ || __GNUC__
+  const char *name = tinfo.name();
+  int status;
+  char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
+  if (status == 0) {
+    *result = undecorated_name;
+    free(undecorated_name);
+    return NVRTC_SUCCESS;
+  }
+#elif defined(_WIN32)
+  const char *name = tinfo.raw_name();
+  if (!name || *name != '.') {
+    return NVRTC_ERROR_INTERNAL_ERROR;
+  }
+  char undecorated_name[4096];
+  //name+1 skips over the '.' prefix
+  if(UnDecorateSymbolName(name+1, undecorated_name,
+                          sizeof(undecorated_name) / sizeof(*undecorated_name),
+                           //note: doesn't seem to work correctly without UNDNAME_NO_ARGUMENTS.
+                           UNDNAME_NO_ARGUMENTS | UNDNAME_NAME_ONLY ) ) {
+    *result = undecorated_name;
+    return NVRTC_SUCCESS;
+  }
+#endif  /* USE_CXXABI || __clang__ || __GNUC__ */
+
+  return NVRTC_ERROR_INTERNAL_ERROR;
+}
+
+/**
+ * \ingroup hosthelper
+ * \brief   nvrtcGetTypeName stores the source level name of the template type argument
+ *          T in the given std::string location.
+ *
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
+ * respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
+ * otherwise *result is initialized with the extracted name.
+ * 
+ * Windows-specific notes:
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(), 
+ *   which is not multi-thread safe.
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
+ *
+ * \param   [in] result: pointer to std::string in which to store the type name.
+ * \return
+ *  - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *  - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
+ *
+ */
+ 
+template <typename T>
+nvrtcResult nvrtcGetTypeName(std::string *result)
+{
+  nvrtcResult res = nvrtcGetTypeName(typeid(__nvrtcGetTypeName_helper_t<T>), 
+                                     result);
+  if (res != NVRTC_SUCCESS) 
+    return res;
+
+  std::string repr = *result;
+  std::size_t idx = repr.find("__nvrtcGetTypeName_helper_t");
+  idx = (idx != std::string::npos) ? repr.find("<", idx) : idx;
+  std::size_t last_idx = repr.find_last_of('>');
+  if (idx == std::string::npos || last_idx == std::string::npos) {
+    return NVRTC_ERROR_INTERNAL_ERROR;
+  }
+  ++idx;
+  *result = repr.substr(idx, last_idx - idx);
+  return NVRTC_SUCCESS;
+}
+
+#endif  /* NVRTC_GET_TYPE_NAME */
+
+#endif /* __NVRTC_H__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0177739620fd6bc1c2b90e536157e819967b9163
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b167111b0b387a5279da6749d946560e1c42c1b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuComplex.h
@@ -0,0 +1,348 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CU_COMPLEX_H_)
+#define CU_COMPLEX_H_
+
+#if !defined(__CUDACC_RTC__)
+#if defined(__GNUC__)
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+#endif
+#endif
+
+/* When trying to include C header file in C++ Code extern "C" is required
+ * But the Standard QNX headers already have ifdef extern in them when compiling C++ Code
+ * extern "C" cannot be nested
+ * Hence keep the header out of extern "C" block
+ */
+
+#if !defined(__CUDACC__)
+#include <math.h>       /* import fabsf, sqrt */
+#endif /* !defined(__CUDACC__) */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#include "vector_types.h"
+
+typedef float2 cuFloatComplex;
+
+__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x) 
+{ 
+    return x.x; 
+}
+
+__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x) 
+{ 
+    return x.y; 
+}
+
+__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex 
+                                                             (float r, float i)
+{
+    cuFloatComplex res;
+    res.x = r;
+    res.y = i;
+    return res;
+}
+
+__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
+{
+    return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
+}
+__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y), 
+                                cuCimagf(x) + cuCimagf(y));
+}
+
+__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+        return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y), 
+                                    cuCimagf(x) - cuCimagf(y));
+}
+
+/* This implementation could suffer from intermediate overflow even though
+ * the final result would be in range. However, various implementations do
+ * not guard against this (presumably to avoid losing performance), so we 
+ * don't do it either to stay competitive.
+ */
+__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    cuFloatComplex prod;
+    prod = make_cuFloatComplex  ((cuCrealf(x) * cuCrealf(y)) - 
+                                 (cuCimagf(x) * cuCimagf(y)),
+                                 (cuCrealf(x) * cuCimagf(y)) + 
+                                 (cuCimagf(x) * cuCrealf(y)));
+    return prod;
+}
+
+/* This implementation guards against intermediate underflow and overflow
+ * by scaling. Such guarded implementations are usually the default for
+ * complex library implementations, with some also offering an unguarded,
+ * faster version.
+ */
+__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
+                                                              cuFloatComplex y)
+{
+    cuFloatComplex quot;
+    float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
+    float oos = 1.0f / s;
+    float ars = cuCrealf(x) * oos;
+    float ais = cuCimagf(x) * oos;
+    float brs = cuCrealf(y) * oos;
+    float bis = cuCimagf(y) * oos;
+    s = (brs * brs) + (bis * bis);
+    oos = 1.0f / s;
+    quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
+                                ((ais * brs) - (ars * bis)) * oos);
+    return quot;
+}
+
+/* 
+ * We would like to call hypotf(), but it's not available on all platforms.
+ * This discrete implementation guards against intermediate underflow and 
+ * overflow by scaling. Otherwise we would lose half the exponent range. 
+ * There are various ways of doing guarded computation. For now chose the 
+ * simplest and fastest solution, however this may suffer from inaccuracies 
+ * if sqrt and division are not IEEE compliant. 
+ */
+__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
+{
+    float a = cuCrealf(x);
+    float b = cuCimagf(x);
+    float v, w, t;
+    a = fabsf(a);
+    b = fabsf(b);
+    if (a > b) {
+        v = a;
+        w = b; 
+    } else {
+        v = b;
+        w = a;
+    }
+    t = w / v;
+    t = 1.0f + t * t;
+    t = v * sqrtf(t);
+    if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
+        t = v + w;
+    }
+    return t;
+}
+
+/* Double precision */
+typedef double2 cuDoubleComplex;
+
+__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x) 
+{ 
+    return x.x; 
+}
+
+__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x) 
+{ 
+    return x.y; 
+}
+
+__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex 
+                                                           (double r, double i)
+{
+    cuDoubleComplex res;
+    res.x = r;
+    res.y = i;
+    return res;
+}
+
+__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
+{
+    return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
+}
+
+__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    return make_cuDoubleComplex (cuCreal(x) + cuCreal(y), 
+                                 cuCimag(x) + cuCimag(y));
+}
+
+__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y), 
+                                 cuCimag(x) - cuCimag(y));
+}
+
+/* This implementation could suffer from intermediate overflow even though
+ * the final result would be in range. However, various implementations do
+ * not guard against this (presumably to avoid losing performance), so we 
+ * don't do it either to stay competitive.
+ */
+__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    cuDoubleComplex prod;
+    prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) - 
+                                 (cuCimag(x) * cuCimag(y)),
+                                 (cuCreal(x) * cuCimag(y)) + 
+                                 (cuCimag(x) * cuCreal(y)));
+    return prod;
+}
+
+/* This implementation guards against intermediate underflow and overflow
+ * by scaling. Such guarded implementations are usually the default for
+ * complex library implementations, with some also offering an unguarded,
+ * faster version.
+ */
+__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
+                                                             cuDoubleComplex y)
+{
+    cuDoubleComplex quot;
+    double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
+    double oos = 1.0 / s;
+    double ars = cuCreal(x) * oos;
+    double ais = cuCimag(x) * oos;
+    double brs = cuCreal(y) * oos;
+    double bis = cuCimag(y) * oos;
+    s = (brs * brs) + (bis * bis);
+    oos = 1.0 / s;
+    quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
+                                 ((ais * brs) - (ars * bis)) * oos);
+    return quot;
+}
+
+/* This implementation guards against intermediate underflow and overflow
+ * by scaling. Otherwise we would lose half the exponent range. There are
+ * various ways of doing guarded computation. For now chose the simplest
+ * and fastest solution, however this may suffer from inaccuracies if sqrt
+ * and division are not IEEE compliant.
+ */
+__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
+{
+    double a = cuCreal(x);
+    double b = cuCimag(x);
+    double v, w, t;
+    a = fabs(a);
+    b = fabs(b);
+    if (a > b) {
+        v = a;
+        w = b; 
+    } else {
+        v = b;
+        w = a;
+    }
+    t = w / v;
+    t = 1.0 + t * t;
+    t = v * sqrt(t);
+    if ((v == 0.0) || 
+        (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
+        t = v + w;
+    }
+    return t;
+}
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+/* aliases */
+typedef cuFloatComplex cuComplex;
+__host__ __device__ static __inline__ cuComplex make_cuComplex (float x, 
+                                                                float y) 
+{ 
+    return make_cuFloatComplex (x, y); 
+}
+
+/* float-to-double promotion */
+__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
+                                                      (cuFloatComplex c)
+{
+    return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
+}
+
+__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
+(cuDoubleComplex c)
+{
+	return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
+}
+
+
+__host__ __device__ static __inline__  cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
+{
+    float real_res;
+    float imag_res;
+    
+    real_res = (cuCrealf(x) *  cuCrealf(y)) + cuCrealf(d);
+    imag_res = (cuCrealf(x) *  cuCimagf(y)) + cuCimagf(d);
+            
+    real_res = -(cuCimagf(x) * cuCimagf(y))  + real_res;  
+    imag_res =  (cuCimagf(x) *  cuCrealf(y)) + imag_res;          
+     
+    return make_cuComplex(real_res, imag_res);
+}
+
+__host__ __device__ static __inline__  cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
+{
+    double real_res;
+    double imag_res;
+    
+    real_res = (cuCreal(x) *  cuCreal(y)) + cuCreal(d);
+    imag_res = (cuCreal(x) *  cuCimag(y)) + cuCimag(d);
+            
+    real_res = -(cuCimag(x) * cuCimag(y))  + real_res;  
+    imag_res =  (cuCimag(x) *  cuCreal(y)) + imag_res;     
+     
+    return make_cuDoubleComplex(real_res, imag_res);
+}
+
+#endif /* !defined(CU_COMPLEX_H_) */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a7fe8a370330454f8a49e083899a50f7dc527ce
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier.h
@@ -0,0 +1,227 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_H_
+# define _CUDA_AWBARRIER_H_
+
+# include "cuda_awbarrier_primitives.h"
+
+# if !defined(_CUDA_AWBARRIER_SM_TARGET)
+#  error This file requires compute capability 7.0 or greater.
+# endif
+
+# if !defined(_CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER)
+#  error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+             -std=c++11 compiler option.
+# endif
+
+_CUDA_AWBARRIER_BEGIN_NAMESPACE
+
+class awbarrier {
+public:
+    class arrival_token {
+    public:
+        arrival_token() = default;
+        ~arrival_token() = default;
+        _CUDA_AWBARRIER_QUALIFIER uint32_t pending_count() const;
+    private:
+        _CUDA_AWBARRIER_QUALIFIER arrival_token(uint64_t token);
+        uint64_t token;
+        friend awbarrier;
+    };
+    awbarrier() = default;
+    awbarrier(const awbarrier&) = delete;
+    awbarrier& operator=(const awbarrier&) = delete;
+    ~awbarrier() = default;
+
+    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive();
+    _CUDA_AWBARRIER_QUALIFIER arrival_token arrive_and_drop();
+    _CUDA_AWBARRIER_QUALIFIER bool timed_wait(arrival_token token, uint32_t hint_cycles);
+    _CUDA_AWBARRIER_QUALIFIER void wait(arrival_token token);
+    _CUDA_AWBARRIER_QUALIFIER void arrive_and_wait();
+    _CUDA_AWBARRIER_STATIC_QUALIFIER __host__ constexpr uint32_t max();
+private:
+    uint64_t barrier;
+    friend _CUDA_AWBARRIER_QUALIFIER void init(awbarrier* barrier, uint32_t expected_count);
+    friend _CUDA_AWBARRIER_QUALIFIER void inval(awbarrier* barrier);
+    friend class pipeline;
+};
+
+_CUDA_AWBARRIER_QUALIFIER
+uint32_t awbarrier::arrival_token::pending_count() const
+{
+    const uint32_t pending_count = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(this->token);
+#if (__CUDA_ARCH__ >= 900)
+    return pending_count;
+#else
+    return (pending_count >> 15);
+#endif
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+awbarrier::arrival_token::arrival_token(uint64_t token)
+    : token(token)
+{
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+void init(awbarrier* barrier, uint32_t expected_count)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count <= _CUDA_AWBARRIER_MAX_COUNT);
+
+#if (__CUDA_ARCH__ >= 900)
+    const uint32_t init_count = expected_count;
+#else
+    const uint32_t init_count = (expected_count << 15) + expected_count;
+#endif
+
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(&barrier->barrier, init_count);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+void inval(awbarrier* barrier)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(&barrier->barrier);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+awbarrier::arrival_token awbarrier::arrive()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+ #if (__CUDA_ARCH__ < 900)
+    const uint32_t arrive_count = 1 << 15;
+    const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<false>(&this->barrier, arrive_count);
+    (void)
+#else
+    const uint64_t token =
+ #endif
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(&this->barrier);
+
+    return arrival_token(token);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+awbarrier::arrival_token awbarrier::arrive_and_drop()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+ #if (__CUDA_ARCH__ < 900)
+    const uint32_t arrive_count = 1 << 15;
+    const uint64_t token = _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop_no_complete<true>(&this->barrier, arrive_count);
+    (void)
+#else
+    const uint64_t token =
+ #endif
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(&this->barrier);
+
+    return arrival_token(token);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+bool awbarrier::timed_wait(arrival_token token, uint32_t hint_cycles)
+{
+    constexpr uint64_t max_busy_wait_cycles = 1024;
+    constexpr uint32_t max_sleep_ns = 1 << 20;
+
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
+        return true;
+    }
+
+    uint64_t start_cycles = clock64();
+    uint64_t elapsed_cycles = 0;
+    uint32_t sleep_ns = 32;
+    while (elapsed_cycles < hint_cycles) {
+        if (_CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(&this->barrier, token.token)) {
+            return true;
+        }
+
+        if (elapsed_cycles > max_busy_wait_cycles) {
+            __nanosleep(sleep_ns);
+            if (sleep_ns < max_sleep_ns) {
+                sleep_ns *= 2;
+            }
+        }
+
+        elapsed_cycles = clock64() - start_cycles;
+    }
+
+    return false;
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+void awbarrier::wait(arrival_token token)
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    while (!timed_wait(token, ~0u));
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+void awbarrier::arrive_and_wait()
+{
+    _CUDA_AWBARRIER_ASSERT(__isShared(&this->barrier));
+
+    this->wait(this->arrive());
+}
+
+_CUDA_AWBARRIER_QUALIFIER __host__
+constexpr uint32_t awbarrier::max()
+{
+    return _CUDA_AWBARRIER_MAX_COUNT;
+}
+
+_CUDA_AWBARRIER_END_NAMESPACE
+
+#endif /* !_CUDA_AWBARRIER_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h
new file mode 100644
index 0000000000000000000000000000000000000000..a112fea7830daf2934afff4aa6c14f0787a9f161
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_helpers.h
@@ -0,0 +1,350 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_HELPERS_H_
+#define _CUDA_AWBARRIER_HELPERS_H_
+
+#define _CUDA_AWBARRIER_NAMESPACE       nvcuda::experimental
+#define _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace nvcuda { namespace experimental {
+#define _CUDA_AWBARRIER_END_NAMESPACE   } }
+
+#define _CUDA_AWBARRIER_INTERNAL_NAMESPACE       _CUDA_AWBARRIER_NAMESPACE::__awbarrier_internal
+#define _CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE _CUDA_AWBARRIER_BEGIN_NAMESPACE namespace __awbarrier_internal {
+#define _CUDA_AWBARRIER_END_INTERNAL_NAMESPACE   } _CUDA_AWBARRIER_END_NAMESPACE
+
+# if !defined(_CUDA_AWBARRIER_QUALIFIER)
+#  define _CUDA_AWBARRIER_QUALIFIER inline __device__
+# endif
+# if !defined(_CUDA_AWBARRIER_STATIC_QUALIFIER)
+#  define _CUDA_AWBARRIER_STATIC_QUALIFIER static inline __device__
+#endif
+
+#if defined(__CUDA_ARCH__)
+#if  (__CUDA_ARCH__ >= 800)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_80
+#elif (__CUDA_ARCH__ >= 700)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
+#endif // No support < 700
+#else // !defined(__CUDA_ARCH__)
+# define _CUDA_AWBARRIER_SM_TARGET _CUDA_AWBARRIER_SM_70
+#endif // defined(__CUDA_ARCH__)
+
+#define _CUDA_AWBARRIER_MAX_COUNT ((1 << 14) - 1)
+
+#if defined(__cplusplus) && ((__cplusplus >= 201103L) || (defined(_MSC_VER) && (_MSC_VER >= 1900)))
+# define _CUDA_AWBARRIER_CPLUSPLUS_11_OR_LATER
+#endif
+
+#if !defined(_CUDA_AWBARRIER_DEBUG)
+# if defined(__CUDACC_DEBUG__)
+#  define _CUDA_AWBARRIER_DEBUG 1
+# else
+#  define _CUDA_AWBARRIER_DEBUG 0
+# endif
+#endif
+
+#if defined(_CUDA_AWBARRIER_DEBUG) && (_CUDA_AWBARRIER_DEBUG == 1) && !defined(NDEBUG)
+# if !defined(__CUDACC_RTC__)
+#  include <cassert>
+# endif
+# define _CUDA_AWBARRIER_ASSERT(x) assert((x));
+# define _CUDA_AWBARRIER_ABORT() assert(0);
+#else
+# define _CUDA_AWBARRIER_ASSERT(x)
+# define _CUDA_AWBARRIER_ABORT() __trap();
+#endif
+
+#if defined(__CUDACC_RTC__)
+typedef unsigned short     uint16_t;
+typedef unsigned int       uint32_t;
+typedef unsigned long long uint64_t;
+typedef uint64_t           uintptr_t;
+#else
+# include <stdint.h>
+#endif
+
+#if defined(_CUDA_AWBARRIER_SM_TARGET)
+
+typedef uint64_t __mbarrier_t;
+typedef uint64_t __mbarrier_token_t;
+
+_CUDA_AWBARRIER_BEGIN_INTERNAL_NAMESPACE
+
+extern "C" __device__ uint32_t __nvvm_get_smem_pointer(void *);
+
+namespace _CUDA_AWBARRIER_SM_70 {
+    union AWBarrier {
+        struct {
+            uint32_t expected;
+            uint32_t pending;
+        } split;
+        uint64_t raw;
+    };
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+        _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
+
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+
+        awbarrier->split.expected = 0x40000000 - expected_count;
+        awbarrier->split.pending = 0x80000000 - expected_count;
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    void __awbarrier_inval(uint64_t* barrier) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint32_t __awbarrier_token_pending_count(uint64_t token) {
+        const uint32_t pending = token >> 32;
+        return 0x80000000 - (pending & 0x7fffffff);
+    }
+
+    template<bool _Drop>
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint64_t __awbarrier_arrive_drop(uint64_t* barrier) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+
+        while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
+
+        if (_Drop) {
+            (void)atomicAdd_block(&awbarrier->split.expected, 1);
+        }
+
+        __threadfence_block();
+
+        const uint32_t old_pending = atomicAdd_block(&awbarrier->split.pending, 1);
+        const uint32_t new_pending = old_pending + 1;
+        const bool reset = (old_pending ^ new_pending) & 0x80000000;
+
+        if (reset) {
+            __threadfence_block();
+
+            uint32_t new_expected = *reinterpret_cast<volatile uint32_t*>(&awbarrier->split.expected);
+            new_expected &= ~0x40000000;
+            if (new_expected & 0x20000000) {
+                new_expected |= 0x40000000;
+            }
+            atomicAdd_block(&awbarrier->split.pending, new_expected);
+        }
+
+        return static_cast<uint64_t>(old_pending) << 32;
+    }
+
+    template<bool _Drop>
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+        _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
+
+        AWBarrier* awbarrier = reinterpret_cast<AWBarrier*>(barrier);
+
+        while ((*reinterpret_cast<volatile uint32_t*>(&awbarrier->split.pending) & 0x7fffffff) == 0);
+
+        if (_Drop) {
+            (void)atomicAdd_block(&awbarrier->split.expected, count);
+        }
+
+        return static_cast<uint64_t>(atomicAdd_block(&awbarrier->split.pending, count)) << 32;
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+        volatile AWBarrier* awbarrier = reinterpret_cast<volatile AWBarrier*>(barrier);
+
+        return ((token >> 32) ^ awbarrier->split.pending) & 0x80000000;
+    }
+}; // namespace _CUDA_AWBARRIER_SM_70
+
+namespace _CUDA_AWBARRIER_SM_80 {
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    void __awbarrier_init(uint64_t* barrier, uint32_t expected_count) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+        _CUDA_AWBARRIER_ASSERT(expected_count > 0 && expected_count < (1 << 29));
+
+        asm volatile ("mbarrier.init.shared.b64 [%0], %1;"
+                :
+                : "r"(__nvvm_get_smem_pointer(barrier)), "r"(expected_count)
+                : "memory");
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    void __awbarrier_inval(uint64_t* barrier) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+        asm volatile ("mbarrier.inval.shared.b64 [%0];"
+                :
+                : "r"(__nvvm_get_smem_pointer(barrier))
+                : "memory");
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint32_t __awbarrier_token_pending_count(uint64_t token) {
+        uint32_t __pending_count;
+
+        asm ("mbarrier.pending_count.b64 %0, %1;"
+                : "=r"(__pending_count)
+                : "l"(token));
+        return __pending_count;
+    }
+
+    template<bool _Drop>
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint64_t __awbarrier_arrive_drop(uint64_t* barrier) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+        uint64_t token;
+
+        if (_Drop) {
+            asm volatile ("mbarrier.arrive_drop.shared.b64 %0, [%1];"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier))
+                    : "memory");
+        } else {
+            asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier))
+                    : "memory");
+        }
+
+        return token;
+    }
+
+    template<bool _Drop>
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    uint64_t __awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t count) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+        _CUDA_AWBARRIER_ASSERT(count > 0 && count < (1 << 29));
+
+        uint64_t token;
+
+        if (_Drop) {
+            asm volatile ("mbarrier.arrive_drop.noComplete.shared.b64 %0, [%1], %2;"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
+                    : "memory");
+        } else {
+            asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
+                    : "=l"(token)
+                    : "r"(__nvvm_get_smem_pointer(barrier)), "r"(count)
+                    : "memory");
+        }
+
+        return token;
+    }
+
+    _CUDA_AWBARRIER_STATIC_QUALIFIER
+    bool __awbarrier_test_wait(uint64_t* barrier, uint64_t token) {
+        _CUDA_AWBARRIER_ASSERT(__isShared(barrier));
+
+        uint16_t __wait_complete;
+
+        asm volatile ("{"
+                "    .reg .pred %%p;"
+                "    mbarrier.test_wait.shared.b64 %%p, [%1], %2;"
+                "    selp.u16 %0, 1, 0, %%p;"
+                "}"
+                : "=h"(__wait_complete)
+                : "r"(__nvvm_get_smem_pointer(barrier)), "l"(token)
+                : "memory");
+        return bool(__wait_complete);
+    }
+
+}; // namespace _CUDA_AWBARRIER_SM_80
+
+_CUDA_AWBARRIER_QUALIFIER
+void awbarrier_init(uint64_t* barrier, uint32_t expected_count)
+{
+    _CUDA_AWBARRIER_SM_TARGET::__awbarrier_init(barrier, expected_count);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+void awbarrier_inval(uint64_t* barrier)
+{
+    _CUDA_AWBARRIER_SM_TARGET::__awbarrier_inval(barrier);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+uint32_t awbarrier_token_pending_count(uint64_t token)
+{
+    return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_token_pending_count(token);
+}
+
+template<bool _Drop>
+_CUDA_AWBARRIER_QUALIFIER
+uint64_t awbarrier_arrive_drop_no_complete(uint64_t* barrier, uint32_t arrive_count)
+{
+    return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop_no_complete<_Drop>(barrier, arrive_count);
+}
+
+template<bool _Drop>
+_CUDA_AWBARRIER_QUALIFIER
+uint64_t awbarrier_arrive_drop(uint64_t* barrier)
+{
+    return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_arrive_drop<_Drop>(barrier);
+}
+
+_CUDA_AWBARRIER_QUALIFIER
+bool awbarrier_test_wait(uint64_t* barrier, uint64_t token)
+{
+    return _CUDA_AWBARRIER_SM_TARGET::__awbarrier_test_wait(barrier, token);
+}
+
+_CUDA_AWBARRIER_END_INTERNAL_NAMESPACE
+
+#endif /* defined(_CUDA_AWBARRIER_SM_TARGET) */
+
+#endif /* !_CUDA_AWBARRIER_HELPERS_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h
new file mode 100644
index 0000000000000000000000000000000000000000..647110a3351477cdd8a197f53c5877648964ab8e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_awbarrier_primitives.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_AWBARRIER_PRIMITIVES_H_
+#define _CUDA_AWBARRIER_PRIMITIVES_H_
+
+#include "cuda_awbarrier_helpers.h"
+
+#if !defined(_CUDA_AWBARRIER_SM_TARGET)
+# error This file requires compute capability 7.0 or greater.
+#endif
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER __host__
+uint32_t __mbarrier_maximum_count() {
+    return _CUDA_AWBARRIER_MAX_COUNT;
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+void __mbarrier_init(__mbarrier_t* barrier, uint32_t expected_count) {
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_init(barrier, expected_count);
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+void __mbarrier_inval(__mbarrier_t* barrier) {
+    _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_inval(barrier);
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+__mbarrier_token_t __mbarrier_arrive(__mbarrier_t* barrier) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<false>(barrier);
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+__mbarrier_token_t __mbarrier_arrive_and_drop(__mbarrier_t* barrier) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_arrive_drop<true>(barrier);
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+bool __mbarrier_test_wait(__mbarrier_t* barrier, __mbarrier_token_t token) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_test_wait(barrier, token);
+}
+
+_CUDA_AWBARRIER_STATIC_QUALIFIER
+uint32_t __mbarrier_token_pending_count(__mbarrier_token_t token) {
+    return _CUDA_AWBARRIER_INTERNAL_NAMESPACE::awbarrier_token_pending_count(token);
+}
+
+#endif /* !_CUDA_AWBARRIER_PRIMITIVES_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..1e3858dc86d7a42270edff1c4ba801007494a38b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_bf16.hpp
@@ -0,0 +1,2683 @@
+/*
+* Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+*
+* NOTICE TO LICENSEE:
+*
+* This source code and/or documentation ("Licensed Deliverables") are
+* subject to NVIDIA intellectual property rights under U.S. and
+* international Copyright laws.
+*
+* These Licensed Deliverables contained herein is PROPRIETARY and
+* CONFIDENTIAL to NVIDIA and is being provided under the terms and
+* conditions of a form of NVIDIA software license agreement by and
+* between NVIDIA and Licensee ("License Agreement") or electronically
+* accepted by Licensee.  Notwithstanding any terms or conditions to
+* the contrary in the License Agreement, reproduction or disclosure
+* of the Licensed Deliverables to any third party without the express
+* written consent of NVIDIA is prohibited.
+*
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+* SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+* PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+* NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+* DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+* NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+* NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+* LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+* SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+* DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+* WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+* ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+* OF THESE LICENSED DELIVERABLES.
+*
+* U.S. Government End Users.  These Licensed Deliverables are a
+* "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+* 1995), consisting of "commercial computer software" and "commercial
+* computer software documentation" as such terms are used in 48
+* C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+* only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+* 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+* U.S. Government End Users acquire the Licensed Deliverables with
+* only those rights set forth herein.
+*
+* Any use of the Licensed Deliverables in individual and commercial
+* software must include, in the user documentation and internal
+* comments to the code, the above Disclaimer and U.S. Government End
+* Users Notice.
+*/
+
+#if !defined(__CUDA_BF16_HPP__)
+#define __CUDA_BF16_HPP__
+
+#if !defined(__CUDA_BF16_H__)
+#error "Do not include this file directly. Instead, include cuda_bf16.h."
+#endif
+
+#if !defined(_MSC_VER) && __cplusplus >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_BF16
+#elif _MSC_FULL_VER >= 190024210 && _MSVC_LANG >= 201103L
+#   define __CPP_VERSION_AT_LEAST_11_BF16
+#endif
+
+/* C++11 header for std::move. 
+ * In RTC mode, std::move is provided implicitly; don't include the header
+ */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__)
+#include <utility>
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) && !defined(__CUDACC_RTC__) */
+
+/* C++ header for std::memcpy (used for type punning in host-side implementations).
+ * When compiling as a CUDA source file memcpy is provided implicitly.
+ * !defined(__CUDACC__) implies !defined(__CUDACC_RTC__).
+ */
+#if defined(__cplusplus) && !defined(__CUDACC__)
+#include <cstring>
+#endif /* defined(__cplusplus) && !defined(__CUDACC__) */
+
+
+/* Set up function decorations */
+#if defined(__CUDACC__)
+#define __CUDA_BF16_DECL__ static __device__ __inline__
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __host__ __device__ __inline__
+#define __VECTOR_FUNCTIONS_DECL__ static __inline__ __host__ __device__
+#define __CUDA_HOSTDEVICE__ __host__ __device__
+#else /* !defined(__CUDACC__) */
+#if defined(__GNUC__)
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static __attribute__ ((unused))
+#else
+#define __CUDA_HOSTDEVICE_BF16_DECL__ static
+#endif /* defined(__GNUC__) */
+#define __CUDA_HOSTDEVICE__
+#endif /* defined(__CUDACC_) */
+
+/* Set up structure-alignment attribute */
+#if defined(__CUDACC__)
+#define __CUDA_ALIGN__(align) __align__(align)
+#else
+/* Define alignment macro based on compiler type (cannot assume C11 "_Alignas" is available) */
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+#define __CUDA_ALIGN__(n) alignas(n)    /* C++11 kindly gives us a keyword for this */
+#else /* defined(__CPP_VERSION_AT_LEAST_11_BF16)*/
+#if defined(__GNUC__)
+#define __CUDA_ALIGN__(n) __attribute__ ((aligned(n)))
+#elif defined(_MSC_VER)
+#define __CUDA_ALIGN__(n) __declspec(align(n))
+#else
+#define __CUDA_ALIGN__(n)
+#endif /* defined(__GNUC__) */
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+#endif /* defined(__CUDACC__) */
+
+/* Macros to allow nv_bfloat16 & nv_bfloat162 to be used by inline assembly */
+#define __BFLOAT16_TO_US(var) *(reinterpret_cast<unsigned short *>(&(var)))
+#define __BFLOAT16_TO_CUS(var) *(reinterpret_cast<const unsigned short *>(&(var)))
+#define __BFLOAT162_TO_UI(var) *(reinterpret_cast<unsigned int *>(&(var)))
+#define __BFLOAT162_TO_CUI(var) *(reinterpret_cast<const unsigned int *>(&(var)))
+
+/**
+* Types which allow static initialization of "nv_bfloat16" and "nv_bfloat162" until
+* these become an actual builtin. Note this initialization is as a
+* bitfield representation of "nv_bfloat16", and not a conversion from short->nv_bfloat16.
+* Such a representation will be deprecated in a future version of CUDA. 
+* (Note these are visible to non-nvcc compilers, including C-only compilation)
+*/
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __nv_bfloat16_raw;
+
+typedef struct __CUDA_ALIGN__(4) {
+    unsigned short x;
+    unsigned short y;
+} __nv_bfloat162_raw;
+
+/* All other definitions in this file are only visible to C++ compilers */
+#if defined(__cplusplus)
+
+/* Hide GCC member initialization list warnings because of host/device in-function init requirement */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#pragma GCC diagnostic ignored "-Weffc++"
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+/* class' : multiple assignment operators specified
+   The class has multiple assignment operators of a single type. This warning is informational */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( push )
+#pragma warning( disable:4522 )
+#endif /* defined(__GNUC__) */
+
+struct __CUDA_ALIGN__(2) __nv_bfloat16 {
+protected:
+    unsigned short __x;
+
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+    __nv_bfloat16() = default;
+#else
+    __CUDA_HOSTDEVICE__ __nv_bfloat16() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+    /* Convert to/from __nv_bfloat16_raw */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const __nv_bfloat16_raw &hr) : __x(hr.x) { }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ volatile __nv_bfloat16 &operator=(const volatile __nv_bfloat16_raw &hr) volatile { __x = hr.x; return *this; }
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const { __nv_bfloat16_raw ret; ret.x = __x; return ret; }
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat16_raw() const volatile { __nv_bfloat16_raw ret; ret.x = __x; return ret; }
+
+#if !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__)
+    /* Construct from float/double */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const float f) { __x = __float2bfloat16(f).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(const double f) { __x = __double2bfloat16(f).__x;  }
+
+    __CUDA_HOSTDEVICE__ operator float() const { return __bfloat162float(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const float f) { __x = __float2bfloat16(f).__x; return *this; }
+
+    /* We omit "cast to double" operator, so as to not be ambiguous about up-cast */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(const double f) { __x = __double2bfloat16(f).__x; return *this; }
+
+/* Member functions only available to nvcc compilation so far */
+#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+    /* Allow automatic construction from types supported natively in hardware */
+    /* Note we do avoid constructor init-list because of special host/device compilation rules */
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(short val) { __x = __short2bfloat16_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(int val) { __x = __int2bfloat16_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned int val) { __x = __uint2bfloat16_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(long long val) { __x = __ll2bfloat16_rn(val).__x;  }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; }
+
+    /* Allow automatic casts to supported builtin types, matching all that are permitted with float */
+    __CUDA_HOSTDEVICE__ operator short() const { return __bfloat162short_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(short val) { __x = __short2bfloat16_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned short() const { return __bfloat162ushort_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned short val) { __x = __ushort2bfloat16_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator int() const { return __bfloat162int_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(int val) { __x = __int2bfloat16_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned int() const { return __bfloat162uint_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned int val) { __x = __uint2bfloat16_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator long long() const { return __bfloat162ll_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(long long val) { __x = __ll2bfloat16_rn(val).__x; return *this; }
+
+    __CUDA_HOSTDEVICE__ operator unsigned long long() const { return __bfloat162ull_rz(*this); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat16 &operator=(unsigned long long val) { __x = __ull2bfloat16_rn(val).__x; return *this; }
+
+    /* Boolean conversion - note both 0 and -0 must return false */
+    __CUDA_HOSTDEVICE__ operator bool() const { return (__x & 0x7FFF) != 0; }
+#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */
+#endif /* !defined(__CUDA_NO_BFLOAT16_CONVERSIONS__) */
+};
+
+/* Global-space operator functions are only available to nvcc compilation */
+#if defined(__CUDACC__)
+
+#if __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)
+#if !defined(__CUDA_NO_BFLOAT16_OPERATORS__)
+/* Some basic arithmetic operations expected of a builtin */
+__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hadd(lh, rh); }
+__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hsub(lh, rh); }
+__device__ __forceinline__ __nv_bfloat16 operator*(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hmul(lh, rh); }
+__device__ __forceinline__ __nv_bfloat16 operator/(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hdiv(lh, rh); }
+
+__device__ __forceinline__ __nv_bfloat16 &operator+=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hadd(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat16 &operator-=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hsub(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat16 &operator*=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hmul(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat16 &operator/=(__nv_bfloat16 &lh, const __nv_bfloat16 &rh) { lh = __hdiv(lh, rh); return lh; }
+
+/* Note for increment and decrement we use the raw value 0x3F80 equating to nv_bfloat16(1.0f), to avoid the extra conversion */
+__device__ __forceinline__ __nv_bfloat16 &operator++(__nv_bfloat16 &h)      { __nv_bfloat16_raw one; one.x = 0x3F80; h += one; return h; }
+__device__ __forceinline__ __nv_bfloat16 &operator--(__nv_bfloat16 &h)      { __nv_bfloat16_raw one; one.x = 0x3F80; h -= one; return h; }
+__device__ __forceinline__ __nv_bfloat16  operator++(__nv_bfloat16 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat16 ret = h;
+    __nv_bfloat16_raw one;
+    one.x = 0x3F80;
+    h += one;
+    return ret;
+}
+__device__ __forceinline__ __nv_bfloat16  operator--(__nv_bfloat16 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat16 ret = h;
+    __nv_bfloat16_raw one;
+    one.x = 0x3F80;
+    h -= one;
+    return ret;
+}
+/* Unary plus and inverse operators */
+__device__ __forceinline__ __nv_bfloat16 operator+(const __nv_bfloat16 &h) { return h; }
+__device__ __forceinline__ __nv_bfloat16 operator-(const __nv_bfloat16 &h) { return __hneg(h); }
+
+/* Some basic comparison operations to make it look like a builtin */
+__device__ __forceinline__ bool operator==(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __heq(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hneu(lh, rh); }
+__device__ __forceinline__ bool operator> (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hgt(lh, rh); }
+__device__ __forceinline__ bool operator< (const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hlt(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hge(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __nv_bfloat16 &lh, const __nv_bfloat16 &rh) { return __hle(lh, rh); }
+#endif /* !defined(__CUDA_NO_BFLOAT16_OPERATORS__) */
+#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */
+#endif /* defined(__CUDACC__) */
+
+/* __nv_bfloat162 is visible to non-nvcc host compilers */
+struct __CUDA_ALIGN__(4) __nv_bfloat162 {
+    __nv_bfloat16 x;
+    __nv_bfloat16 y;
+
+    // All construct/copy/assign/move
+public:
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+    __nv_bfloat162() = default;
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(__nv_bfloat162 &&src) { __BFLOAT162_TO_UI(*this) = std::move(__BFLOAT162_TO_CUI(src)); return *this; }
+#else
+    __CUDA_HOSTDEVICE__ __nv_bfloat162() { }
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat16 &a, const __nv_bfloat16 &b) : x(a), y(b) { }
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162 &src) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(src); return *this; }
+
+    /* Convert to/from __nv_bfloat162_raw */
+    __CUDA_HOSTDEVICE__ __nv_bfloat162(const __nv_bfloat162_raw &h2r ) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); }
+    __CUDA_HOSTDEVICE__ __nv_bfloat162 &operator=(const __nv_bfloat162_raw &h2r) { __BFLOAT162_TO_UI(*this) = __BFLOAT162_TO_CUI(h2r); return *this; }
+    __CUDA_HOSTDEVICE__ operator __nv_bfloat162_raw() const { __nv_bfloat162_raw ret; ret.x = 0U; ret.y = 0U; __BFLOAT162_TO_UI(ret) = __BFLOAT162_TO_CUI(*this); return ret; }
+};
+
+/* Global-space operator functions are only available to nvcc compilation */
+#if defined(__CUDACC__)
+
+#if (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) && !defined(__CUDA_NO_BFLOAT162_OPERATORS__)
+
+__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hadd2(lh, rh); }
+__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hsub2(lh, rh); }
+__device__ __forceinline__ __nv_bfloat162 operator*(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hmul2(lh, rh); }
+__device__ __forceinline__ __nv_bfloat162 operator/(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __h2div(lh, rh); }
+
+__device__ __forceinline__ __nv_bfloat162& operator+=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hadd2(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat162& operator-=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hsub2(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat162& operator*=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __hmul2(lh, rh); return lh; }
+__device__ __forceinline__ __nv_bfloat162& operator/=(__nv_bfloat162 &lh, const __nv_bfloat162 &rh) { lh = __h2div(lh, rh); return lh; }
+
+__device__ __forceinline__ __nv_bfloat162 &operator++(__nv_bfloat162 &h)      { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hadd2(h, one); return h; }
+__device__ __forceinline__ __nv_bfloat162 &operator--(__nv_bfloat162 &h)      { __nv_bfloat162_raw one; one.x = 0x3F80; one.y = 0x3F80; h = __hsub2(h, one); return h; }
+__device__ __forceinline__ __nv_bfloat162  operator++(__nv_bfloat162 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat162 ret = h;
+    __nv_bfloat162_raw one;
+    one.x = 0x3F80;
+    one.y = 0x3F80;
+    h = __hadd2(h, one);
+    return ret;
+}
+__device__ __forceinline__ __nv_bfloat162  operator--(__nv_bfloat162 &h, const int ignored)
+{
+    // ignored on purpose. Parameter only needed to distinguish the function declaration from other types of operators.
+    static_cast<void>(ignored);
+
+    const __nv_bfloat162 ret = h;
+    __nv_bfloat162_raw one;
+    one.x = 0x3F80;
+    one.y = 0x3F80;
+    h = __hsub2(h, one);
+    return ret;
+}
+__device__ __forceinline__ __nv_bfloat162 operator+(const __nv_bfloat162 &h) { return h; }
+__device__ __forceinline__ __nv_bfloat162 operator-(const __nv_bfloat162 &h) { return __hneg2(h); }
+
+__device__ __forceinline__ bool operator==(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbeq2(lh, rh); }
+__device__ __forceinline__ bool operator!=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbneu2(lh, rh); }
+__device__ __forceinline__ bool operator>(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbgt2(lh, rh); }
+__device__ __forceinline__ bool operator<(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hblt2(lh, rh); }
+__device__ __forceinline__ bool operator>=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hbge2(lh, rh); }
+__device__ __forceinline__ bool operator<=(const __nv_bfloat162 &lh, const __nv_bfloat162 &rh) { return __hble2(lh, rh); }
+
+#endif /* __CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__) */
+#endif /* defined(__CUDACC__) */
+
+/* Restore warning for multiple assignment operators */
+#if defined(_MSC_VER) && _MSC_VER >= 1500
+#pragma warning( pop )
+#endif /* defined(_MSC_VER) && _MSC_VER >= 1500 */
+
+/* Restore -Weffc++ warnings from here on */
+#if defined(__GNUC__)
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+#pragma GCC diagnostic pop
+#endif /* __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6) */
+#endif /* defined(__GNUC__) */
+
+#undef __CUDA_HOSTDEVICE__
+#undef __CUDA_ALIGN__
+
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short __internal_float2bfloat16(const float f, unsigned int &sign, unsigned int &remainder)
+{
+    unsigned int x;
+
+#if defined(__CUDA_ARCH__)
+    x = __float_as_uint(f);
+#elif defined(__CUDACC__)
+    (void)memcpy(&x, &f, sizeof(f));
+#else
+    (void)std::memcpy(&x, &f, sizeof(f));
+#endif
+
+    if ((x & 0x7fffffffU) > 0x7f800000U) {
+        sign = 0U;
+        remainder = 0U;
+        return static_cast<unsigned short>(0x7fffU);
+    }
+    sign = x >> 31U;
+    remainder = x << 16U;
+    return static_cast<unsigned short>(x >> 16U);
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __double2bfloat16(const double x)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("{  cvt.rn.bf16.f64 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "d"(x));
+    return val;
+#else
+
+    float f = static_cast<float>(x);
+    const double d = static_cast<double>(f);
+    unsigned int u;
+
+#if defined(__CUDA_ARCH__)
+    u = __float_as_uint(f);
+#elif defined(__CUDACC__)
+    (void)memcpy(&u, &f, sizeof(f));
+#else
+    (void)std::memcpy(&u, &f, sizeof(f));
+#endif
+    bool x_is_not_nan = ((u << (unsigned)1U) <= (unsigned)0xFF000000U);
+
+
+    if ((x > 0.0) && (d > x)) {
+        u--;
+    }
+    if ((x < 0.0) && (d < x)) {
+        u--;
+    }
+    if ((d != x) && x_is_not_nan) {
+        u |= 1U;
+    }
+
+#if defined(__CUDA_ARCH__)
+    f = __int_as_float(static_cast<int>(u));
+#elif defined(__CUDACC__)
+    (void)memcpy(&f, &u, sizeof(f));
+#else
+    (void)std::memcpy(&f, &u, sizeof(f));
+#endif
+
+    return __float2bfloat16(f);
+
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16(const float a)
+{
+    __nv_bfloat16 val;
+#if __CUDA_ARCH__ >= 800
+    asm("{  cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+#else
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rn(const float a)
+{
+    __nv_bfloat16 val;
+#if __CUDA_ARCH__ >= 800
+    asm("{  cvt.rn.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+#else
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder > 0x80000000U) || ((remainder == 0x80000000U) && ((r.x & 0x1U) != 0U))) {
+        r.x++;
+    }
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rz(const float a)
+{
+    __nv_bfloat16 val;
+#if __CUDA_ARCH__ >= 800
+    asm("{  cvt.rz.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+#else
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    val = r;
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_rd(const float a)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("{  cvt.rm.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+    return val;
+#else
+    __nv_bfloat16 val;
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder != 0U) && (sign != 0U)) {
+        r.x++;
+    }
+    val = r;
+    return val;
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __float2bfloat16_ru(const float a)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("{  cvt.rp.bf16.f32 %0, %1;}\n" : "=h"(__BFLOAT16_TO_US(val)) : "f"(a));
+    return val;
+#else
+    __nv_bfloat16 val;
+    __nv_bfloat16_raw r;
+    unsigned int sign = 0U;
+    unsigned int remainder = 0U;
+    r.x = __internal_float2bfloat16(a, sign, remainder);
+    if ((remainder != 0U) && (sign == 0U)) {
+        r.x++;
+    }
+    val = r;
+    return val;
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float2bfloat162_rn(const float a)
+{
+    __nv_bfloat162 val;
+#if __CUDA_ARCH__ >= 800
+    asm("{.reg .b16 low;\n"
+        "  cvt.rn.bf16.f32 low, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a));
+#else
+    val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(a));
+#endif
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __floats2bfloat162_rn(const float a, const float b)
+{
+    __nv_bfloat162 val;
+#if __CUDA_ARCH__ >= 800
+    asm("{ cvt.rn.bf16x2.f32 %0, %2, %1;}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "f"(a), "f"(b));
+#else
+    val = __nv_bfloat162(__float2bfloat16_rn(a), __float2bfloat16_rn(b));
+#endif
+    return val;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __internal_bfloat162float(const unsigned short h)
+{
+    float f;
+#if defined(__CUDA_ARCH__)
+    #if (__CUDA_ARCH__ >= 900)
+        asm("{ cvt.f32.bf16 %0, %1;}\n" : "=f"(f) : "h"(h));
+    #else
+        asm("{ mov.b32 %0, {0,%1};}\n" : "=f"(f) : "h"(h));
+    #endif
+#else
+    unsigned int u = static_cast<unsigned int>(h) << 16;
+    #if defined(__CUDACC__)
+        (void)memcpy(&f, &u, sizeof(f));
+    #else
+        (void)std::memcpy(&f, &u, sizeof(f));
+    #endif
+#endif
+    return f;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __bfloat162float(const __nv_bfloat16 a)
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat16_raw>(a).x);
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ float __low2float(const __nv_bfloat162 a)
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).x);
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ float __high2float(const __nv_bfloat162 a)
+{
+    return __internal_bfloat162float(static_cast<__nv_bfloat162_raw>(a).y);
+}
+
+#if defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__))
+
+/* CUDA vector-types compatible vector creation function (note returns __nv_bfloat162, not nv_bfloat162) */
+__VECTOR_FUNCTIONS_DECL__ __nv_bfloat162 make_bfloat162(const __nv_bfloat16 x, const __nv_bfloat16 y)
+{
+    __nv_bfloat162 t; t.x = x; t.y = y; return t;
+}
+#undef __VECTOR_FUNCTIONS_DECL__
+
+
+/* Definitions of intrinsics */
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat162 __float22bfloat162_rn(const float2 a)
+{
+    __nv_bfloat162 val = __floats2bfloat162_rn(a.x, a.y);
+    return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ float2 __bfloat1622float2(const __nv_bfloat162 a)
+{
+    float hi_float;
+    float lo_float;
+    lo_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).x);
+    hi_float = __internal_bfloat162float(((__nv_bfloat162_raw)a).y);
+    return make_float2(lo_float, hi_float);
+}
+__CUDA_BF16_DECL__ int __bfloat162int_rn(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    int val;
+    asm("{  cvt.rni.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2int_rn(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ int __bfloat162int_rz(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    int val;
+    asm("{  cvt.rzi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    const float f = __bfloat162float(h);
+    int   i;
+    i = static_cast<int>(f);
+#if !(defined __CUDA_ARCH__)
+    const int max_val = (int)0x7fffffffU;
+    const int min_val = (int)0x80000000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = 0;
+    } else if (f >= static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    }
+#endif
+    return i;
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+}
+__CUDA_BF16_DECL__ int __bfloat162int_rd(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    int val;
+    asm("{  cvt.rmi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2int_rd(__bfloat162float(h));
+#endif
+}
+__CUDA_BF16_DECL__ int __bfloat162int_ru(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    int val;
+    asm("{  cvt.rpi.s32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2int_ru(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rn(const int i)
+{
+#if (defined __CUDA_ARCH__)
+    #if (__CUDA_ARCH__ >= 900)
+        __nv_bfloat16 val;
+       asm("cvt.rn.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+       return val;
+    #else
+        const float ru = __int2float_ru(i);
+        const float rd = __int2float_rd(i);
+        float rz = __int2float_rz(i);
+        if (ru != rd) {
+            rz = __uint_as_float(__float_as_uint(rz) | 1U);
+        }
+        return __float2bfloat16_rn(rz);
+    #endif
+#else
+    const double d = static_cast<double>(i);
+    return __double2bfloat16(d);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rz(const int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rz.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_rz(__int2float_rz(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_rd(const int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rm.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_rd(__int2float_rd(i));
+#endif
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __int2bfloat16_ru(const int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rp.bf16.s32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_ru(__int2float_ru(i));
+#endif
+}
+
+__CUDA_BF16_DECL__ short int __bfloat162short_rn(const __nv_bfloat16 h)
+{
+   short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rni.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rni.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+
+__CUDA_HOSTDEVICE_BF16_DECL__ short int __bfloat162short_rz(const __nv_bfloat16 h)
+{
+   short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rzi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#elif (defined __CUDA_ARCH__)
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rzi.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+    const float f = __bfloat162float(h);
+    val = static_cast<short int>(f);
+    const short int max_val = (short int)0x7fffU;
+    const short int min_val = (short int)0x8000U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        val = 0;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        val = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        val = min_val;
+    }
+#endif
+   return val;
+}
+__CUDA_BF16_DECL__ short int __bfloat162short_rd(const __nv_bfloat16 h)
+{
+   short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rmi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rmi.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+__CUDA_BF16_DECL__ short int __bfloat162short_ru(const __nv_bfloat16 h)
+{
+   short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rpi.s16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rpi.s16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rn(const short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rn.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    const float f = static_cast<float>(i);
+    return __float2bfloat16_rn(f);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rz(const short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rz.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_rz(__int2float_rz(static_cast<int>(i)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_rd(const short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rm.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_rd(__int2float_rd(static_cast<int>(i)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short2bfloat16_ru(const short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rp.bf16.s16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_ru(__int2float_ru(static_cast<int>(i)));
+#endif
+}
+
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rn(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned int val;
+    asm("{  cvt.rni.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2uint_rn(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned int __bfloat162uint_rz(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned int val;
+    asm("{  cvt.rzi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+
+    const float f = __bfloat162float(h);
+    unsigned int i;
+    i = static_cast<unsigned int>(f);
+#if !(defined __CUDA_ARCH__)
+    const unsigned int max_val = 0xffffffffU;
+    const unsigned int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = 0U;
+    } else if (f >= static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    }
+#endif
+    return i;
+
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+}
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_rd(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned int val;
+    asm("{  cvt.rmi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2uint_rd(__bfloat162float(h));
+#endif
+}
+__CUDA_BF16_DECL__ unsigned int __bfloat162uint_ru(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned int val;
+    asm("{  cvt.rpi.u32.bf16 %0, %1;}\n" : "=r"(val) : "h"(__BFLOAT16_TO_CUS(h)));
+    return val;
+#else
+    return __float2uint_ru(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rn(const unsigned int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rn.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#elif (defined __CUDA_ARCH__)
+    const float ru = __uint2float_ru(i);
+    const float rd = __uint2float_rd(i);
+    float rz = __uint2float_rz(i);
+    if (ru != rd) {
+        rz = __uint_as_float(__float_as_uint(rz) | 1U);
+    }
+    return __float2bfloat16_rn(rz);
+#else
+    const double d = static_cast<double>(i);
+    return __double2bfloat16(d);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rz(const unsigned int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rz.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_rz(__uint2float_rz(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_rd(const unsigned int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rm.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_rd(__uint2float_rd(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __uint2bfloat16_ru(const unsigned int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+     __nv_bfloat16 val;
+    asm("cvt.rp.bf16.u32 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "r"(i));
+    return val;
+#else
+    return __float2bfloat16_ru(__uint2float_ru(i));
+#endif
+}
+
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rn(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rni.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rni.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned short int __bfloat162ushort_rz(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rzi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#elif (defined __CUDA_ARCH__)
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rzi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+    const float f = __bfloat162float(h);
+    val = static_cast<unsigned short int>(f);
+    const unsigned short int max_val = 0xffffU;
+    const unsigned short int min_val = 0U;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        val = 0U;
+    } else if (f > static_cast<float>(max_val)) {
+        // saturate maximum
+        val = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        val = min_val;
+    }
+#endif
+   return val;
+}
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_rd(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rmi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rmi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+__CUDA_BF16_DECL__ unsigned short int __bfloat162ushort_ru(const __nv_bfloat16 h)
+{
+   unsigned short int val;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm("cvt.rpi.u16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+   asm("{ .reg.f32 f;\n"
+       "  mov.b32 f, {0,%1};\n"
+       "  cvt.rpi.u16.f32 %0,f;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(h)));
+#endif
+   return val;
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rn(const unsigned short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rn.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    const float f = static_cast<float>(i);
+    return __float2bfloat16_rn(f);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rz(const unsigned short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rz.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_rz(__uint2float_rz(static_cast<unsigned int>(i)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_rd(const unsigned short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rm.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_rd(__uint2float_rd(static_cast<unsigned int>(i)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort2bfloat16_ru(const unsigned short int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 val;
+    asm("cvt.rp.bf16.u16 %0, %1;" : "=h"(__BFLOAT16_TO_US(val)) : "h"(i));
+    return val;
+#else
+    return __float2bfloat16_ru(__uint2float_ru(static_cast<unsigned int>(i)));
+#endif
+}
+
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rn(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned long long int i;
+    asm("cvt.rni.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ull_rn(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ unsigned long long int __bfloat162ull_rz(const __nv_bfloat16 h)
+{
+    unsigned long long int i;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    asm("cvt.rzi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    const float f = __bfloat162float(h);
+    i = static_cast<unsigned long long int>(f);
+#if !(defined __CUDA_ARCH__)
+    const unsigned long long int max_val = 0xffffffffffffffffULL;
+    const unsigned long long int min_val = 0ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = 0x8000000000000000ULL;
+    } else if (f >= static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    }
+#endif
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    return i;
+}
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_rd(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned long long int i;
+    asm("cvt.rmi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ull_rd(__bfloat162float(h));
+#endif
+}
+__CUDA_BF16_DECL__ unsigned long long int __bfloat162ull_ru(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    unsigned long long int i;
+    asm("cvt.rpi.u64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ull_ru(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rn(const unsigned long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rn.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#elif (defined __CUDA_ARCH__)
+    const float ru = __ull2float_ru(i);
+    const float rd = __ull2float_rd(i);
+    float rz = __ull2float_rz(i);
+    if (ru != rd) {
+        rz = __uint_as_float(__float_as_uint(rz) | 1U);
+    }
+    return __float2bfloat16_rn(rz);
+#else
+    float f = static_cast<float>(i);
+    const unsigned long long int uf = static_cast<unsigned long long int>(f);
+    unsigned int u;
+
+    #if defined(__CUDA_ARCH__)
+        u = __float_as_uint(f);
+    #elif defined(__CUDACC__)
+        (void)memcpy(&u, &f, sizeof(f));
+    #else
+        (void)std::memcpy(&u, &f, sizeof(f));
+    #endif
+
+    // round up happened here
+    // note: no need to handle round up to f == 0x1.p64 specially
+    if (uf > i) {
+        u--;
+    }
+    if (uf != i) {
+        u |= 1U;
+    }
+
+    #if defined(__CUDA_ARCH__)
+        f = __int_as_float(static_cast<int>(u));
+    #elif defined(__CUDACC__)
+        (void)memcpy(&f, &u, sizeof(f));
+    #else
+        (void)std::memcpy(&f, &u, sizeof(f));
+    #endif
+
+    return __float2bfloat16_rn(f);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rz(const unsigned long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rz.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_rz(__ull2float_rz(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_rd(const unsigned long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rm.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_rd(__ull2float_rd(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ull2bfloat16_ru(const unsigned long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rp.bf16.u64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_ru(__ull2float_ru(i));
+#endif
+}
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rn(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    long long int i;
+    asm("cvt.rni.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ll_rn(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ long long int __bfloat162ll_rz(const __nv_bfloat16 h)
+{
+    long long int i;
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    asm("cvt.rzi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+#else
+    const float f = __bfloat162float(h);
+    i = static_cast<long long int>(f);
+#if !(defined __CUDA_ARCH__)
+    const long long int max_val = (long long int)0x7fffffffffffffffULL;
+    const long long int min_val = (long long int)0x8000000000000000ULL;
+    const unsigned short bits = static_cast<unsigned short>(static_cast<__nv_bfloat16_raw>(h).x << 1U);
+    // saturation fixup
+    if (bits > (unsigned short)0xFF00U) {
+        // NaN
+        i = min_val;
+    } else if (f >= static_cast<float>(max_val)) {
+        // saturate maximum
+        i = max_val;
+    } else if (f < static_cast<float>(min_val)) {
+        // saturate minimum
+        i = min_val;
+    }
+#endif
+#endif // defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    return i;
+}
+__CUDA_BF16_DECL__ long long int __bfloat162ll_rd(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    long long int i;
+    asm("cvt.rmi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ll_rd(__bfloat162float(h));
+#endif
+}
+__CUDA_BF16_DECL__ long long int __bfloat162ll_ru(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    long long int i;
+    asm("cvt.rpi.s64.bf16 %0, %1;" : "=l"(i) : "h"(__BFLOAT16_TO_CUS(h)));
+    return i;
+#else
+    return __float2ll_ru(__bfloat162float(h));
+#endif
+}
+__CUDA_HOSTDEVICE_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rn(const long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rn.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#elif (defined __CUDA_ARCH__)
+    const float ru = __ll2float_ru(i);
+    const float rd = __ll2float_rd(i);
+    float rz = __ll2float_rz(i);
+    if (ru != rd) {
+        rz = __uint_as_float(__float_as_uint(rz) | 1U);
+    }
+    return __float2bfloat16_rn(rz);
+#else
+    float f = static_cast<float>(i);
+    const long long int lf = static_cast<long long int>(f);
+    unsigned int u;
+
+    #if defined(__CUDA_ARCH__)
+        u = __float_as_uint(f);
+    #elif defined(__CUDACC__)
+        (void)memcpy(&u, &f, sizeof(f));
+    #else
+        (void)std::memcpy(&u, &f, sizeof(f));
+    #endif
+
+    if ((f > 0.0f) && (lf > i)) {
+        u--;
+    }
+    if ((f < 0.0f) && (lf < i)) {
+        u--;
+    }
+    if (lf != i) {
+        u |= 1U;
+    }
+
+    #if defined(__CUDA_ARCH__)
+        f = __int_as_float(static_cast<int>(u));
+    #elif defined(__CUDACC__)
+        (void)memcpy(&f, &u, sizeof(f));
+    #else
+        (void)std::memcpy(&f, &u, sizeof(f));
+    #endif
+
+    return __float2bfloat16_rn(f);
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rz(const long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rz.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_rz(__ll2float_rz(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_rd(const long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rm.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_rd(__ll2float_rd(i));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ll2bfloat16_ru(const long long int i)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 h;
+    asm("cvt.rp.bf16.s64 %0, %1;" : "=h"(__BFLOAT16_TO_US(h)) : "l"(i));
+    return h;
+#else
+    return __float2bfloat16_ru(__ll2float_ru(i));
+#endif
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 htrunc(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm("cvt.rzi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+#else
+    return __float2bfloat16_rz(truncf(__bfloat162float(h)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hceil(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm("cvt.rpi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+#else
+    return __float2bfloat16_ru(ceilf(__bfloat162float(h)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hfloor(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm("cvt.rmi.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+#else
+    return __float2bfloat16_rd(floorf(__bfloat162float(h)));
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrint(const __nv_bfloat16 h)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm("cvt.rni.bf16.bf16 %0, %1;" : "=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(h)));
+    return r;
+#else
+    return __float2bfloat16_rn(rintf(__bfloat162float(h)));
+#endif
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2trunc(const __nv_bfloat162 h)
+{
+    const __nv_bfloat16 low = __float2bfloat16_rz(truncf(__low2float(h)));
+    const __nv_bfloat16 high = __float2bfloat16_rz(truncf(__high2float(h)));
+    return __nv_bfloat162(low, high);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2ceil(const __nv_bfloat162 h)
+{
+    const __nv_bfloat16 low = __float2bfloat16_ru(ceilf(__low2float(h)));
+    const __nv_bfloat16 high = __float2bfloat16_ru(ceilf(__high2float(h)));
+    return __nv_bfloat162(low, high);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2floor(const __nv_bfloat162 h)
+{
+    const __nv_bfloat16 low = __float2bfloat16_rd(floorf(__low2float(h)));
+    const __nv_bfloat16 high = __float2bfloat16_rd(floorf(__high2float(h)));
+    return __nv_bfloat162(low, high);
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rint(const __nv_bfloat162 h)
+{
+    return __halves2bfloat162(hrint(__low2bfloat16(h)), hrint(__high2bfloat16(h)));
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __lows2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm("{.reg .b16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {alow,blow};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __highs2bfloat162(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm("{.reg .b16 alow,ahigh,blow,bhigh;\n"
+        "  mov.b32 {alow,ahigh}, %1;\n"
+        "  mov.b32 {blow,bhigh}, %2;\n"
+        "  mov.b32 %0, {ahigh,bhigh};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)), "r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __low2bfloat16(const __nv_bfloat162 a)
+{
+    __nv_bfloat16 ret;
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, low;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return ret;
+}
+__CUDA_BF16_DECL__ int __hisinf(const __nv_bfloat16 a)
+{
+    int retval;
+    if (__BFLOAT16_TO_CUS(a) == 0xFF80U) {
+        retval = -1;
+    } else if (__BFLOAT16_TO_CUS(a) == 0x7F80U) {
+        retval = 1;
+    } else {
+        retval = 0;
+    }
+    return retval;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __low2bfloat162(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {low,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __high2bfloat162(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,high};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __high2bfloat16(const __nv_bfloat162 a)
+{
+    __nv_bfloat16 ret;
+    asm("{.reg .b16 low,high;\n"
+        " mov.b32 {low,high}, %1;\n"
+        " mov.b16 %0, high;}" : "=h"(__BFLOAT16_TO_US(ret)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __halves2bfloat162(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat162 val;
+    asm("{  mov.b32 %0, {%1,%2};}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __bfloat162bfloat162(const __nv_bfloat16 a)
+{
+    __nv_bfloat162 val;
+    asm("{  mov.b32 %0, {%1,%1};}\n"
+        : "=r"(__BFLOAT162_TO_UI(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __lowhigh2highlow(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 val;
+    asm("{.reg .b16 low,high;\n"
+        "  mov.b32 {low,high}, %1;\n"
+        "  mov.b32 %0, {high,low};}\n" : "=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ short int __bfloat16_as_short(const __nv_bfloat16 h)
+{
+    return static_cast<short int>(__BFLOAT16_TO_CUS(h));
+}
+__CUDA_BF16_DECL__ unsigned short int __bfloat16_as_ushort(const __nv_bfloat16 h)
+{
+    return __BFLOAT16_TO_CUS(h);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __short_as_bfloat16(const short int i)
+{
+    __nv_bfloat16 h;
+    __BFLOAT16_TO_US(h) = static_cast<unsigned short int>(i);
+    return h;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ushort_as_bfloat16(const unsigned short int i)
+{
+    __nv_bfloat16 h;
+    __BFLOAT16_TO_US(h) = i;
+    return h;
+}
+
+/******************************************************************************
+*                           __nv_bfloat16, __nv_bfloat162 warp shuffle                     *
+******************************************************************************/
+#define __SHUFFLE_SYNC_BFLOAT162_MACRO(name) /* do */ {\
+   __nv_bfloat162 r; \
+   asm volatile ("{" __CUDA_BF16_STRINGIFY(name) " %0,%1,%2,%3,%4;\n}" \
+       :"=r"(__BFLOAT162_TO_UI(r)): "r"(__BFLOAT162_TO_CUI(var)), "r"(delta), "r"(c), "r"(mask)); \
+   return r; \
+} /* while(0) */
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.idx.b32)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_up_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = (warp_size - static_cast<unsigned>(width)) << 8U;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.up.b32)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_down_sync(const unsigned mask, const __nv_bfloat162 var, const unsigned int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.down.b32)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __shfl_xor_sync(const unsigned mask, const __nv_bfloat162 var, const int delta, const int width)
+{
+    unsigned int warp_size;
+    asm("{mov.u32 %0, WARP_SZ;\n}" : "=r"(warp_size));
+    const unsigned int c = ((warp_size - static_cast<unsigned>(width)) << 8U) | 0x1fU;
+    __SHUFFLE_SYNC_BFLOAT162_MACRO(shfl.sync.bfly.b32)
+}
+
+#undef __SHUFFLE_SYNC_BFLOAT162_MACRO
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width)
+{
+    const __nv_bfloat162 temp1 = __halves2bfloat162(var, var);
+    const __nv_bfloat162 temp2 = __shfl_sync(mask, temp1, delta, width);
+    return __low2bfloat16(temp2);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_up_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width)
+{
+    const __nv_bfloat162 temp1 = __halves2bfloat162(var, var);
+    const __nv_bfloat162 temp2 = __shfl_up_sync(mask, temp1, delta, width);
+    return __low2bfloat16(temp2);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_down_sync(const unsigned mask, const __nv_bfloat16 var, const unsigned int delta, const int width)
+{
+    const __nv_bfloat162 temp1 = __halves2bfloat162(var, var);
+    const __nv_bfloat162 temp2 = __shfl_down_sync(mask, temp1, delta, width);
+    return __low2bfloat16(temp2);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __shfl_xor_sync(const unsigned mask, const __nv_bfloat16 var, const int delta, const int width)
+{
+    const __nv_bfloat162 temp1 = __halves2bfloat162(var, var);
+    const __nv_bfloat162 temp2 = __shfl_xor_sync(mask, temp1, delta, width);
+    return __low2bfloat16(temp2);
+}
+
+/******************************************************************************
+*               __nv_bfloat16 and __nv_bfloat162 __ldg,__ldcg,__ldca,__ldcs                *
+******************************************************************************/
+
+#if defined(__cplusplus)
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __LDG_PTR   "l"
+#else
+#define __LDG_PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldg(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.nc.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldg(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.nc.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcg(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cg.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcg(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cg.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldca(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.ca.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldca(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.ca.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcs(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cs.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcs(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cs.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr));
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldlu(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.lu.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldlu(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.lu.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __ldcv(const  __nv_bfloat162 *const ptr)
+{
+    __nv_bfloat162 ret;
+    asm ("ld.global.cv.b32 %0, [%1];"  : "=r"(__BFLOAT162_TO_UI(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __ldcv(const __nv_bfloat16 *const ptr)
+{
+    __nv_bfloat16 ret;
+    asm ("ld.global.cv.b16 %0, [%1];"  : "=h"(__BFLOAT16_TO_US(ret)) : __LDG_PTR(ptr) : "memory");
+    return ret;
+}
+
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.wb.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stwb(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.wb.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.cg.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcg(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.cg.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.cs.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stcs(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.cs.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat162 *const ptr, const __nv_bfloat162 value)
+{
+    asm ("st.global.wt.b32 [%0], %1;"  :: __LDG_PTR(ptr), "r"(__BFLOAT162_TO_CUI(value)) : "memory");
+}
+__CUDA_BF16_DECL__ void __stwt(__nv_bfloat16 *const ptr, const __nv_bfloat16 value)
+{
+    asm ("st.global.wt.b16 [%0], %1;"  :: __LDG_PTR(ptr),  "h"(__BFLOAT16_TO_CUS(value)) : "memory");
+}
+
+#undef __LDG_PTR
+#endif /*defined(__cplusplus) */
+/******************************************************************************
+*                             __nv_bfloat162 comparison                             *
+******************************************************************************/
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   __nv_bfloat162 val; \
+   asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val; \
+}
+#else
+#define __COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   __nv_bfloat162 val; \
+   asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\
+        "  and.b32 high_a, %1, 0xffff0000U;\n"\
+        "  and.b32 high_b, %2, 0xffff0000U;\n"\
+        "  shl.b32 low_a, %1, 16;\n"\
+        "  shl.b32 low_b, %2, 16;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\
+        "  shr.u32 low_res, low_res, 16;\n"\
+        "  or.b32  %0, high_res, low_res;}\n"\
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val; \
+}
+#endif
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __heq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.eq)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ne)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hle2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.le)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hge2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ge)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hlt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.lt)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hgt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.gt)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hequ2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.equ)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hneu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.neu)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hleu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.leu)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.geu)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hltu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.ltu)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __COMPARISON_OP_BFLOAT162_MACRO(set.gtu)
+}
+#undef __COMPARISON_OP_BFLOAT162_MACRO
+
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   __nv_bfloat162 val; \
+   bool retval; \
+   asm( "{ " __CUDA_BF16_STRINGIFY(name) ".bf16x2.bf16x2 %0,%1,%2;\n}" \
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   if (__BFLOAT162_TO_CUI(val) == 0x3F803F80U) {\
+      retval = true; \
+   } else { \
+      retval = false; \
+   }\
+   return retval;\
+}
+#else
+
+#define __BOOL_COMPARISON_OP_BFLOAT162_MACRO(name) {\
+   unsigned int val; \
+   asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\
+        "  and.b32 high_a, %1, 0xffff0000U;\n"\
+        "  and.b32 high_b, %2, 0xffff0000U;\n"\
+        "  shl.b32 low_a, %1, 16;\n"\
+        "  shl.b32 low_b, %2, 16;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 low_res, low_a, low_b;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32.f32 high_res, high_a, high_b;\n"\
+        "  and.b32 %0, high_res, low_res;}\n"\
+        :"=r"(val) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return (val != 0U) ? true : false; \
+}
+#endif
+
+__CUDA_BF16_DECL__ bool __hbeq2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.eq)
+}
+__CUDA_BF16_DECL__ bool __hbne2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ne)
+}
+__CUDA_BF16_DECL__ bool __hble2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.le)
+}
+__CUDA_BF16_DECL__ bool __hbge2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ge)
+}
+__CUDA_BF16_DECL__ bool __hblt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.lt)
+}
+__CUDA_BF16_DECL__ bool __hbgt2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gt)
+}
+__CUDA_BF16_DECL__ bool __hbequ2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.equ)
+}
+__CUDA_BF16_DECL__ bool __hbneu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.neu)
+}
+__CUDA_BF16_DECL__ bool __hbleu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.leu)
+}
+__CUDA_BF16_DECL__ bool __hbgeu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.geu)
+}
+__CUDA_BF16_DECL__ bool __hbltu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.ltu)
+}
+__CUDA_BF16_DECL__ bool __hbgtu2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __BOOL_COMPARISON_OP_BFLOAT162_MACRO(set.gtu)
+}
+#undef __BOOL_COMPARISON_OP_BFLOAT162_MACRO
+/******************************************************************************
+*                             __nv_bfloat16 comparison                              *
+******************************************************************************/
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\
+   unsigned short val; \
+   asm( "{ .reg .pred __$temp3;\n" \
+        "  setp." __CUDA_BF16_STRINGIFY(name) ".bf16  __$temp3, %1, %2;\n" \
+        "  selp.u16 %0, 1, 0, __$temp3;}" \
+        : "=h"(val) : "h"(__BFLOAT16_TO_CUS(a)), "h"(__BFLOAT16_TO_CUS(b))); \
+   return (val != 0U) ? true : false; \
+}
+#else
+#define __COMPARISON_OP_BFLOAT16_MACRO(name) {\
+   unsigned int val; \
+   asm( "{.reg .b32 a,b;\n"\
+        "  mov.b32 a, {0, %1};\n"\
+        "  mov.b32 b, {0, %2};\n"\
+        "  set." __CUDA_BF16_STRINGIFY(name) ".f32.f32 %0, a, b;}\n"\
+        :"=r"(val) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return (val != 0U) ? true : false; \
+}
+#endif
+__CUDA_BF16_DECL__ bool __heq(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(eq)
+}
+__CUDA_BF16_DECL__ bool __hne(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(ne)
+}
+__CUDA_BF16_DECL__ bool __hle(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(le)
+}
+__CUDA_BF16_DECL__ bool __hge(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(ge)
+}
+__CUDA_BF16_DECL__ bool __hlt(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(lt)
+}
+__CUDA_BF16_DECL__ bool __hgt(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(gt)
+}
+__CUDA_BF16_DECL__ bool __hequ(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(equ)
+}
+__CUDA_BF16_DECL__ bool __hneu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(neu)
+}
+__CUDA_BF16_DECL__ bool __hleu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(leu)
+}
+__CUDA_BF16_DECL__ bool __hgeu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(geu)
+}
+__CUDA_BF16_DECL__ bool __hltu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(ltu)
+}
+__CUDA_BF16_DECL__ bool __hgtu(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __COMPARISON_OP_BFLOAT16_MACRO(gtu)
+}
+#undef __COMPARISON_OP_BFLOAT16_MACRO
+/******************************************************************************
+*                            __nv_bfloat162 arithmetic                             *
+******************************************************************************/
+#define __BINARY_OP_BFLOAT162_MACRO(name) /* do */ {\
+   __nv_bfloat162 val; \
+   asm( "{.reg .b32 low_a,low_b,high_a,high_b,high_res,low_res;\n"\
+        " .reg .b16 low,high;\n"\
+        "  and.b32 high_a, %1, 0xffff0000U;\n"\
+        "  and.b32 high_b, %2, 0xffff0000U;\n"\
+        "  shl.b32 low_a, %1, 16;\n"\
+        "  shl.b32 low_b, %2, 16;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32 low_res, low_a, low_b;\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32 high_res, high_a, high_b;\n"\
+        "  cvt.rn.bf16.f32 low, low_res;\n"\
+        "  cvt.rn.bf16.f32 high, high_res;\n"\
+        "  mov.b32 %0, {low,high};}\n"\
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val; \
+} /* while(0) */
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ add.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x3f803f80U;\n"
+        "  fma.rn.bf16x2 %0,%1,c,%2;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ sub.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0xbf80bf80U;\n"
+        "  fma.rn.bf16x2 %0,%2,c,%1;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ mul.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x80008000U;\n"
+        "  fma.rn.bf16x2 %0,%1,%2,c;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ add.rn.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x3f803f80U;\n"
+        "  fma.rn.bf16x2 %0,%1,c,%2;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ sub.rn.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0xbf80bf80U;\n"
+        "  fma.rn.bf16x2 %0,%2,c,%1;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_rn(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ mul.rn.bf16x2 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b32 c;\n"
+        "  mov.b32 c, 0x80008000U;\n"
+        "  fma.rn.bf16x2 %0,%1,%2,c;}\n"
+#endif
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hadd2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+   asm( "{.reg .b32 f, one, zero;\n"
+        "  mov.b32 one, 0x3f803f80U;\n"
+        "  mov.b32 zero, 0;\n"
+        "  fma.rn.bf16x2 f,%1,one,%2;\n"
+        "  max.bf16x2 f, f, zero;\n"
+        "  min.bf16x2 %0, f, one;\n}"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hsub2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+   asm( "{.reg .b32 f, one, zero, mone;\n"
+        "  mov.b32 one, 0x3f803f80U;\n"
+        "  mov.b32 zero, 0;\n"
+        "  mov.b32 mone, 0xbf80bf80U;\n"
+        "  fma.rn.bf16x2 f,%2,mone,%1;\n"
+        "  max.bf16x2 f, f, zero;\n"
+        "  min.bf16x2 %0, f, one;\n}"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmul2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+   __nv_bfloat162 val;
+   asm( "{.reg .b32 f, one, zero, mzero;\n"
+        "  mov.b32 one, 0x3f803f80U;\n"
+        "  mov.b32 zero, 0;\n"
+        "  mov.b32 mzero, 0x80008000U;\n"
+        "  fma.rn.bf16x2 f,%1,%2,mzero;\n"
+        "  max.bf16x2 f, f, zero;\n"
+        "  min.bf16x2 %0, f, one;\n}"
+        :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{
+    __nv_bfloat162 val;
+    asm( "{fma.rn.bf16x2 %0,%1,%2,%3;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_sat(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{
+    __nv_bfloat162 val;
+    asm( "{ .reg .b32 f, one, zero;\n"
+         "  mov.b32 one, 0x3f803f80U;\n"
+         "  mov.b32 zero, 0;\n"
+         "  fma.rn.bf16x2 f, %1, %2, %3;\n"
+         "  max.bf16x2 f, f, zero;\n"
+         "  min.bf16x2 %0, f, one;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __h2div(const __nv_bfloat162 a, const __nv_bfloat162 b) {
+    __nv_bfloat16 ha, hb;
+
+    ha = __low2bfloat16(a);
+    hb = __low2bfloat16(b);
+
+    const __nv_bfloat16 v1 = __hdiv(ha, hb);
+
+    ha = __high2bfloat16(a);
+    hb = __high2bfloat16(b);
+
+    const __nv_bfloat16 v2 = __hdiv(ha, hb);
+
+    return __halves2bfloat162(v1, v2);
+}
+/******************************************************************************
+*                             __nv_bfloat16 arithmetic                             *
+******************************************************************************/
+#define __BINARY_OP_BFLOAT16_MACRO(name) /* do */ {\
+   __nv_bfloat16 val; \
+   asm( "{.reg .b32 a,b,res;\n"\
+        "  mov.b32 a, {0,%1};\n"\
+        "  mov.b32 b, {0,%2};\n"\
+        "  " __CUDA_BF16_STRINGIFY(name) ".f32 res, a, b;\n"\
+        "  cvt.rn.bf16.f32 %0, res;}\n"\
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val; \
+} /* while(0) */
+
+__CUDA_BF16_DECL__ __nv_bfloat16 __hadd(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ add.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x3f80U;\n"
+        "  fma.rn.bf16 %0,%1,c,%2;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsub(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ sub.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0xbf80U;\n"
+        "  fma.rn.bf16 %0,%2,c,%1;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmul(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ mul.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x8000U;\n"
+        "  fma.rn.bf16 %0,%1,%2,c;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ add.rn.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x3f80U;\n"
+        "  fma.rn.bf16 %0,%1,c,%2;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ sub.rn.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0xbf80U;\n"
+        "  fma.rn.bf16 %0,%2,c,%1;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_rn(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+   asm( "{ mul.rn.bf16 %0,%1,%2; }\n"
+#else
+   asm( "{.reg .b16 c;\n"
+        "  mov.b16 c, 0x8000U;\n"
+        "  fma.rn.bf16 %0,%1,%2,c;}\n"
+#endif
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b))); \
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hadd_sat(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ .reg .b16 f, one, zero;\n"
+         "  mov.b16 one, 0x3f80U;\n"
+         "  mov.b16 zero, 0;\n"
+         "  fma.rn.bf16 f, %1, one, %2;\n"
+         "  max.bf16 f, f, zero;\n"
+         "  min.bf16 %0, f, one;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsub_sat(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ .reg .b16 f, one, zero, mone;\n"
+         "  mov.b16 one, 0x3f80U;\n"
+         "  mov.b16 zero, 0;\n"
+         "  mov.b16 mone, 0xbf80U;\n"
+         "  fma.rn.bf16 f, %2, mone, %1;\n"
+         "  max.bf16 f, f, zero;\n"
+         "  min.bf16 %0, f, one;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmul_sat(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ .reg .b16 f, one, zero, mzero;\n"
+         "  mov.b16 one, 0x3f80U;\n"
+         "  mov.b16 zero, 0;\n"
+         "  mov.b16 mzero, 0x8000U;\n"
+         "  fma.rn.bf16 f, %1, %2, mzero;\n"
+         "  max.bf16 f, f, zero;\n"
+         "  min.bf16 %0, f, one;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
+{
+    __nv_bfloat16 val;
+    asm( "{fma.rn.bf16 %0,%1,%2,%3;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_sat(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
+{
+    __nv_bfloat16 val;
+    asm( "{ .reg .b16 f, one, zero;\n"
+         "  mov.b16 one, 0x3f80U;\n"
+         "  mov.b16 zero, 0;\n"
+         "  fma.rn.bf16 f, %1, %2, %3;\n"
+         "  max.bf16 f, f, zero;\n"
+         "  min.bf16 %0, f, one;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hdiv(const __nv_bfloat16 a, const __nv_bfloat16 b) {
+    __BINARY_OP_BFLOAT16_MACRO(div.rn)
+}
+
+/******************************************************************************
+*                             __nv_bfloat162 functions                  *
+******************************************************************************/
+#define __APPROX_FCAST(fun) /* do */ {\
+   __nv_bfloat16 val;\
+   asm("{.reg.b32         f;        \n"\
+                " .reg.b16         r;        \n"\
+                "  mov.b16         r,%1;     \n"\
+                "  mov.b32         f,{0,r};  \n"\
+                "  " __CUDA_BF16_STRINGIFY(fun) ".approx.f32   f,f;  \n"\
+                "  cvt.rn.bf16.f32    r,f;  \n"\
+                "  mov.b16         %0,r;     \n"\
+                "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));\
+   return val;\
+} /* while(0) */
+#define __APPROX_FCAST2(fun) /* do */ {\
+   __nv_bfloat162 val;\
+   asm("{.reg.b16         hl, hu;         \n"\
+                " .reg.b32         fl, fu;         \n"\
+                "  mov.b32         {hl, hu}, %1;   \n"\
+                "  mov.b32         fl, {0,hl};     \n"\
+                "  mov.b32         fu, {0,hu};     \n"\
+                "  " __CUDA_BF16_STRINGIFY(fun) ".approx.f32   fl, fl;     \n"\
+                "  " __CUDA_BF16_STRINGIFY(fun) ".approx.f32   fu, fu;     \n"\
+                "  cvt.rn.bf16.f32    hl, fl;     \n"\
+                "  cvt.rn.bf16.f32    hu, fu;     \n"\
+                "  mov.b32         %0, {hl, hu};   \n"\
+                "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));       \
+   return val;\
+} /* while(0) */
+__CUDA_BF16_DECL__ __nv_bfloat16 __hsin_internal(const __nv_bfloat16 a) {
+    float f = __bfloat162float(a);
+    f = sinf(f);
+    return __float2bfloat16_rn(f);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hsin(const __nv_bfloat16 a) {
+    return __hsin_internal(a);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sin(const __nv_bfloat162 a) {
+    const __nv_bfloat16 l = __low2bfloat16(a);
+    const __nv_bfloat16 h = __high2bfloat16(a);
+    return __halves2bfloat162(__hsin_internal(l), __hsin_internal(h));
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hcos_internal(const __nv_bfloat16 a) {
+    float f = __bfloat162float(a);
+    f = cosf(f);
+    return __float2bfloat16_rn(f);
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hcos(const __nv_bfloat16 a) {
+    return __hcos_internal(a);
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2cos(const __nv_bfloat162 a) {
+    const __nv_bfloat16 l = __low2bfloat16(a);
+    const __nv_bfloat16 h = __high2bfloat16(a);
+    return __halves2bfloat162(__hcos_internal(l), __hcos_internal(h));
+}
+
+#define __BF16_SPEC_CASE2(i,r, spc, ulp) \
+   "{.reg.b32 spc, ulp, p;\n"\
+   "  mov.b32 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\
+   "  mov.b32 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16x2.f16x2 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\
+   "  fma.rn.bf16x2 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n"
+#define __BF16_SPEC_CASE(i,r, spc, ulp) \
+   "{.reg.b16 spc, ulp, p;\n"\
+   "  mov.b16 spc," __CUDA_BF16_STRINGIFY(spc) ";\n"\
+   "  mov.b16 ulp," __CUDA_BF16_STRINGIFY(ulp) ";\n"\
+   "  set.eq.f16.f16 p," __CUDA_BF16_STRINGIFY(i) ", spc;\n"\
+   "  fma.rn.bf16 " __CUDA_BF16_STRINGIFY(r) ",p,ulp," __CUDA_BF16_STRINGIFY(r) ";\n}\n"
+
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp(const __nv_bfloat16 a) {
+    __nv_bfloat16 val;
+    asm("{.reg.b32          f, C;           \n"
+        " .reg.b16          h,r;            \n"
+        "  mov.b16          h,%1;           \n"
+        "  mov.b32          f,{0,h};        \n"
+        "  mov.b32          C, 0x3FB8AA3CU;  \n"
+        "  mul.f32          f,f,C;          \n"
+        "  ex2.approx.f32   f,f;            \n"
+        "  cvt.rn.bf16.f32 r,f;            \n"
+        "  mov.b16          %0,r;           \n"
+        "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp(const __nv_bfloat162 a) {
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu, C;   \n"
+        "  mov.b32         {hl, hu}, %1;   \n"
+        "  mov.b32         h, %1;          \n"
+        "  mov.b32         fl, {0,hl};     \n"
+        "  mov.b32         fu, {0,hu};     \n"
+        "  mov.b32         C, 0x3FB8AA3CU;  \n"
+        "  mul.f32         fl,fl,C;        \n"
+        "  mul.f32         fu,fu,C;        \n"
+        "  ex2.approx.f32      fl, fl;     \n"
+        "  ex2.approx.f32      fu, fu;     \n"
+        "  cvt.rn.bf16.f32    hl, fl;     \n"
+        "  cvt.rn.bf16.f32    hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n"
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp2(const __nv_bfloat16 a) {
+    __APPROX_FCAST(ex2)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp2(const __nv_bfloat162 a) {
+    __APPROX_FCAST2(ex2)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hexp10(const __nv_bfloat16 a) {
+    __nv_bfloat16 val;
+    asm("{.reg.b16         h, r;           \n"
+        " .reg.b32         f, C;           \n"
+        "  mov.b16         h, %1;          \n"
+        "  mov.b32         f, {0,h};       \n"
+        "  mov.b32         C, 0x40549A78U;  \n"
+        "  mul.f32         f,f,C;          \n"
+        "  ex2.approx.f32      f, f;       \n"
+        "  cvt.rn.bf16.f32    r, f;       \n"
+        __BF16_SPEC_CASE(%1, r, 0xBC95U,0xBF00U)
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2exp10(const __nv_bfloat162 a) {
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;         \n"
+        " .reg.b32         h,r,fl,fu, C;   \n"
+        "  mov.b32         {hl, hu}, %1;   \n"
+        "  mov.b32         fl, {0,hl};     \n"
+        "  mov.b32         fu, {0,hu};     \n"
+        "  mov.b32         C, 0x40549A78U;  \n"
+        "  mul.f32         fl,fl,C;        \n"
+        "  mul.f32         fu,fu,C;        \n"
+        "  ex2.approx.f32      fl, fl;     \n"
+        "  ex2.approx.f32      fu, fu;     \n"
+        "  cvt.rn.bf16.f32    hl, fl;     \n"
+        "  cvt.rn.bf16.f32    hu, fu;     \n"
+        "  mov.b32         r, {hl, hu};    \n"
+        __BF16_SPEC_CASE2(%1, r, 0xBC95BC95U,0xBF00BF00U)
+        "  mov.b32         %0, r;  \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog2(const __nv_bfloat16 a) {
+    __APPROX_FCAST(lg2)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log2(const __nv_bfloat162 a) {
+    __APPROX_FCAST2(lg2)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog(const __nv_bfloat16 a) {
+    __nv_bfloat16 val;
+    asm("{.reg.b32         f, C;           \n"
+        " .reg.b16         r,h;            \n"
+        "  mov.b16         h,%1;           \n"
+        "  mov.b32         f,{0,h};        \n"
+        "  lg2.approx.f32      f,f;        \n"
+        "  mov.b32         C, 0x3f317218U; \n"
+        "  mul.f32         f,f,C;          \n"
+        "  cvt.rn.bf16.f32    r,f;        \n"
+        "  mov.b16         %0,r;           \n"
+        "}": "=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log(const __nv_bfloat162 a) {
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n"
+        "  mov.b32         h, %1;              \n"
+        "  mov.b32         fl, {0,hl};         \n"
+        "  mov.b32         fu, {0,hu};         \n"
+        "  lg2.approx.f32      fl, fl;         \n"
+        "  lg2.approx.f32      fu, fu;         \n"
+        "  mov.b32         C, 0x3f317218U;     \n"
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.bf16.f32    hl, fl;         \n"
+        "  cvt.rn.bf16.f32    hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n"
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hlog10(const __nv_bfloat16 a) {
+    __nv_bfloat16 val;
+    asm("{.reg.b16         h, r;           \n"
+        " .reg.b32         f, C;           \n"
+        "  mov.b16         h, %1;          \n"
+        "  mov.b32         f, {0,h};           \n"
+        "  lg2.approx.f32      f, f;       \n"
+        "  mov.b32         C, 0x3E9A209BU;  \n"
+        "  mul.f32         f,f,C;          \n"
+        "  cvt.rn.bf16.f32    r, f;       \n"
+        "  mov.b16         %0, r;          \n"
+        "}":"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2log10(const __nv_bfloat162 a) {
+    __nv_bfloat162 val;
+    asm("{.reg.b16         hl, hu;             \n"
+        " .reg.b32         r, fl, fu, C, h;    \n"
+        "  mov.b32         {hl, hu}, %1;       \n"
+        "  mov.b32         h, %1;              \n"
+        "  mov.b32         fl, {0,hl};         \n"
+        "  mov.b32         fu, {0,hu};         \n"
+        "  lg2.approx.f32      fl, fl;         \n"
+        "  lg2.approx.f32      fu, fu;         \n"
+        "  mov.b32         C, 0x3E9A209BU;      \n"
+        "  mul.f32         fl,fl,C;            \n"
+        "  mul.f32         fu,fu,C;            \n"
+        "  cvt.rn.bf16.f32    hl, fl;         \n"
+        "  cvt.rn.bf16.f32    hu, fu;         \n"
+        "  mov.b32         r, {hl, hu};        \n"
+        "  mov.b32         %0, r;              \n"
+        "}":"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return val;
+}
+#undef __BF16_SPEC_CASE2
+#undef __BF16_SPEC_CASE
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rcp(const __nv_bfloat162 a) {
+    __APPROX_FCAST2(rcp)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrcp(const __nv_bfloat16 a) {
+    __APPROX_FCAST(rcp)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2rsqrt(const __nv_bfloat162 a) {
+    __APPROX_FCAST2(rsqrt)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hrsqrt(const __nv_bfloat16 a) {
+    __APPROX_FCAST(rsqrt)
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 h2sqrt(const __nv_bfloat162 a) {
+    __APPROX_FCAST2(sqrt)
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 hsqrt(const __nv_bfloat16 a) {
+    __APPROX_FCAST(sqrt)
+}
+#undef __APPROX_FCAST
+#undef __APPROX_FCAST2
+__CUDA_BF16_DECL__ __nv_bfloat162 __hisnan2(const __nv_bfloat162 a)
+{
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat162 r;
+    asm("{set.nan.bf16x2.bf16x2 %0,%1,%1;\n}"
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return r;
+#else
+    const __nv_bfloat162 b = a;
+    __BINARY_OP_BFLOAT162_MACRO(set.nan.f32)
+#endif
+}
+__CUDA_BF16_DECL__ bool __hisnan(const __nv_bfloat16 a)
+{
+#if (defined __CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm("{set.nan.bf16.bf16 %0,%1,%1;\n}"
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return __BFLOAT16_TO_CUS(r) != 0U;
+#else
+    unsigned int r;
+    asm( "{.reg .b32 a;\n"
+         "  mov.b32 a, {0,%1};\n"
+         "  set.nan.f32.f32 %0, a, a;}\n"
+         :"=r"(r) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r != 0U;
+#endif
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hneg2(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 r;
+    asm("{neg.bf16x2 %0,%1;\n}"
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return r;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hneg(const __nv_bfloat16 a)
+{
+    __nv_bfloat16 r;
+    asm("{neg.bf16 %0,%1;\n}"
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __habs2(const __nv_bfloat162 a)
+{
+    __nv_bfloat162 r;
+    asm("{abs.bf16x2 %0,%1;\n}"
+        :"=r"(__BFLOAT162_TO_UI(r)) : "r"(__BFLOAT162_TO_CUI(a)));
+    return r;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __habs(const __nv_bfloat16 a)
+{
+    __nv_bfloat16 r;
+    asm("{abs.bf16 %0,%1;\n}"
+        :"=h"(__BFLOAT16_TO_US(r)) : "h"(__BFLOAT16_TO_CUS(a)));
+    return r;
+}
+/******************************************************************************
+*                             __nv_bfloat16 arithmetic                             *
+******************************************************************************/
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmax(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+   __nv_bfloat16 val;
+   asm( "{ max.bf16 %0,%1,%2;\n}"
+        :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+   return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmin(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ min.bf16 %0,%1,%2;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmax_nan(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ max.NaN.bf16 %0,%1,%2;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hmin_nan(const __nv_bfloat16 a, const __nv_bfloat16 b)
+{
+    __nv_bfloat16 val;
+    asm( "{ min.NaN.bf16 %0,%1,%2;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat16 __hfma_relu(const __nv_bfloat16 a, const __nv_bfloat16 b, const __nv_bfloat16 c)
+{
+    __nv_bfloat16 val;
+    asm( "{ fma.rn.relu.bf16 %0,%1,%2,%3;\n}"
+         :"=h"(__BFLOAT16_TO_US(val)) : "h"(__BFLOAT16_TO_CUS(a)),"h"(__BFLOAT16_TO_CUS(b)),"h"(__BFLOAT16_TO_CUS(c)));
+    return val;
+}
+/******************************************************************************
+*                            __nv_bfloat162 arithmetic                             *
+******************************************************************************/
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm( "{ max.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm( "{ min.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmax2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm( "{ max.NaN.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hmin2_nan(const __nv_bfloat162 a, const __nv_bfloat162 b)
+{
+    __nv_bfloat162 val;
+    asm( "{ min.NaN.bf16x2 %0,%1,%2;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)));
+    return val;
+}
+__CUDA_BF16_DECL__ __nv_bfloat162 __hfma2_relu(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{
+    __nv_bfloat162 val;
+    asm( "{ fma.rn.relu.bf16x2 %0,%1,%2,%3;\n}"
+         :"=r"(__BFLOAT162_TO_UI(val)) : "r"(__BFLOAT162_TO_CUI(a)),"r"(__BFLOAT162_TO_CUI(b)),"r"(__BFLOAT162_TO_CUI(c)));
+    return val;
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat162 __hcmadd(const __nv_bfloat162 a, const __nv_bfloat162 b, const __nv_bfloat162 c)
+{
+    // fast version of complex multiply-accumulate
+    // (a.re, a.im) * (b.re, b.im) + (c.re, c.im)
+    // acc.re = (c.re + a.re*b.re) - a.im*b.im
+    // acc.im = (c.im + a.re*b.im) + a.im*b.re
+    __nv_bfloat16 real_tmp = __hfma(a.x, b.x, c.x);
+    __nv_bfloat16 img_tmp  = __hfma(a.x, b.y, c.y);
+    real_tmp = __hfma(__hneg(a.y), b.y, real_tmp);
+    img_tmp  = __hfma(a.y,         b.x, img_tmp);
+    return make_bfloat162(real_tmp, img_tmp);
+}
+
+
+/* Define __PTR for atomicAdd prototypes below, undef after done */
+#if (defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)
+#define __PTR   "l"
+#else
+#define __PTR   "r"
+#endif /*(defined(_MSC_VER) && defined(_WIN64)) || defined(__LP64__) || defined(__CUDACC_RTC__)*/
+
+__CUDA_BF16_DECL__ __nv_bfloat162 atomicAdd(__nv_bfloat162 *const address, const __nv_bfloat162 val)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat162 r;
+    asm volatile ("{ atom.add.noftz.bf16x2 %0,[%1],%2; }\n"
+                  : "=r"(__BFLOAT162_TO_UI(r)) : __PTR(address), "r"(__BFLOAT162_TO_CUI(val))
+                  : "memory");
+   return r;
+#else
+    unsigned int* address_as_uint = (unsigned int*)address;
+    unsigned int old = *address_as_uint, assumed;
+    do {
+        assumed = old;
+        __nv_bfloat162 new_val = __hadd2(val, *(__nv_bfloat162*)&assumed);
+        old = atomicCAS(address_as_uint, assumed, *(unsigned int*)&new_val);
+    } while (assumed != old);
+    return *(__nv_bfloat162*)&old;
+#endif
+}
+
+__CUDA_BF16_DECL__ __nv_bfloat16 atomicAdd(__nv_bfloat16 *const address, const __nv_bfloat16 val)
+{
+#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)
+    __nv_bfloat16 r;
+    asm volatile ("{ atom.add.noftz.bf16 %0,[%1],%2; }\n"
+                  : "=h"(__BFLOAT16_TO_US(r))
+                  : __PTR(address), "h"(__BFLOAT16_TO_CUS(val))
+                  : "memory");
+   return r;
+#else
+    unsigned short int* address_as_us = (unsigned short int*)address;
+    unsigned short int old = *address_as_us, assumed;
+    do {
+        assumed = old;
+        old = atomicCAS(address_as_us, assumed,
+            __bfloat16_as_ushort(__hadd(val, __ushort_as_bfloat16(assumed))));
+    } while (assumed != old);
+    return __ushort_as_bfloat16(old);
+#endif
+}
+
+#undef __PTR
+#undef __CUDA_BF16_DECL__
+#endif /* defined(__CUDACC__) && (__CUDA_ARCH__ >= 800 || !defined(__CUDA_ARCH__)) */
+#endif /* defined(__cplusplus) */
+
+#undef __BINARY_OP_BFLOAT162_MACRO
+#undef __BINARY_OP_BFLOAT16_MACRO
+
+#undef __CUDA_HOSTDEVICE_BF16_DECL__
+#undef __CUDA_BF16_DECL__
+
+/* Define first-class types "nv_bfloat16" and "nv_bfloat162", unless user specifies otherwise via "#define CUDA_NO_BFLOAT16" */
+/* C cannot ever have these types defined here, because __nv_bfloat16 and __nv_bfloat162 are C++ classes */
+#if defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16)
+typedef __nv_bfloat16  nv_bfloat16;
+typedef __nv_bfloat162 nv_bfloat162;
+
+#endif /* defined(__cplusplus) && !defined(CUDA_NO_BFLOAT16) */
+ 
+#if defined(__CPP_VERSION_AT_LEAST_11_BF16)
+#undef __CPP_VERSION_AT_LEAST_11_BF16
+#endif /* defined(__CPP_VERSION_AT_LEAST_11_BF16) */
+
+#endif /* end of include guard: __CUDA_BF16_HPP__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h
new file mode 100644
index 0000000000000000000000000000000000000000..ffe55709f8ccdebf7341180f043006b68c08e104
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_occupancy.h
@@ -0,0 +1,1958 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/**
+ * CUDA Occupancy Calculator
+ *
+ * NAME
+ *
+ *   cudaOccMaxActiveBlocksPerMultiprocessor,
+ *   cudaOccMaxPotentialOccupancyBlockSize,
+ *   cudaOccMaxPotentialOccupancyBlockSizeVariableSMem
+ *   cudaOccAvailableDynamicSMemPerBlock
+ *
+ * DESCRIPTION
+ *
+ *   The CUDA occupancy calculator provides a standalone, programmatical
+ *   interface to compute the occupancy of a function on a device. It can also
+ *   provide occupancy-oriented launch configuration suggestions.
+ *
+ *   The function and device are defined by the user through
+ *   cudaOccFuncAttributes, cudaOccDeviceProp, and cudaOccDeviceState
+ *   structures. All APIs require all 3 of them.
+ *
+ *   See the structure definition for more details about the device / function
+ *   descriptors.
+ *
+ *   See each API's prototype for API usage.
+ *
+ * COMPATIBILITY
+ *
+ *   The occupancy calculator will be updated on each major CUDA toolkit
+ *   release. It does not provide forward compatibility, i.e. new hardwares
+ *   released after this implementation's release will not be supported.
+ *
+ * NOTE
+ *
+ *   If there is access to CUDA runtime, and the sole intent is to calculate
+ *   occupancy related values on one of the accessible CUDA devices, using CUDA
+ *   runtime's occupancy calculation APIs is recommended.
+ *
+ */
+
+#ifndef __cuda_occupancy_h__
+#define __cuda_occupancy_h__
+
+#include <stddef.h>
+#include <limits.h>
+#include <string.h>
+
+
+// __OCC_INLINE will be undefined at the end of this header
+//
+#ifdef __CUDACC__
+#define __OCC_INLINE inline __host__ __device__
+#elif defined _MSC_VER
+#define __OCC_INLINE __inline
+#else // GNUCC assumed
+#define __OCC_INLINE inline
+#endif
+
+enum cudaOccError_enum {
+    CUDA_OCC_SUCCESS              = 0,  // no error encountered
+    CUDA_OCC_ERROR_INVALID_INPUT  = 1,  // input parameter is invalid
+    CUDA_OCC_ERROR_UNKNOWN_DEVICE = 2,  // requested device is not supported in
+                                        // current implementation or device is
+                                        // invalid
+};
+typedef enum cudaOccError_enum       cudaOccError;
+
+typedef struct cudaOccResult         cudaOccResult;
+typedef struct cudaOccDeviceProp     cudaOccDeviceProp;
+typedef struct cudaOccFuncAttributes cudaOccFuncAttributes;
+typedef struct cudaOccDeviceState    cudaOccDeviceState;
+
+/**
+ * The CUDA occupancy calculator computes the occupancy of the function
+ * described by attributes with the given block size (blockSize), static device
+ * properties (properties), dynamic device states (states) and per-block dynamic
+ * shared memory allocation (dynamicSMemSize) in bytes, and output it through
+ * result along with other useful information. The occupancy is computed in
+ * terms of the maximum number of active blocks per multiprocessor. The user can
+ * then convert it to other metrics, such as number of active warps.
+ *
+ * RETURN VALUE
+ *
+ * The occupancy and related information is returned through result.
+ *
+ * If result->activeBlocksPerMultiprocessor is 0, then the given parameter
+ * combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ */
+static __OCC_INLINE
+cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
+    cudaOccResult               *result,           // out
+    const cudaOccDeviceProp     *properties,       // in
+    const cudaOccFuncAttributes *attributes,       // in
+    const cudaOccDeviceState    *state,            // in
+    int                          blockSize,        // in
+    size_t                       dynamicSmemSize); // in
+
+/**
+ * The CUDA launch configurator C API suggests a grid / block size pair (in
+ * minGridSize and blockSize) that achieves the best potential occupancy
+ * (i.e. maximum number of active warps with the smallest number of blocks) for
+ * the given function described by attributes, on a device described by
+ * properties with settings in state.
+ *
+ * If per-block dynamic shared memory allocation is not needed, the user should
+ * leave both blockSizeToDynamicSMemSize and dynamicSMemSize as 0.
+ *
+ * If per-block dynamic shared memory allocation is needed, then if the dynamic
+ * shared memory size is constant regardless of block size, the size should be
+ * passed through dynamicSMemSize, and blockSizeToDynamicSMemSize should be
+ * NULL.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with different
+ * block sizes, the user needs to provide a pointer to an unary function through
+ * blockSizeToDynamicSMemSize that computes the dynamic shared memory needed by
+ * a block of the function for any given block size. dynamicSMemSize is
+ * ignored. An example signature is:
+ *
+ *    // Take block size, returns dynamic shared memory needed
+ *    size_t blockToSmem(int blockSize);
+ *
+ * RETURN VALUE
+ *
+ * The suggested block size and the minimum number of blocks needed to achieve
+ * the maximum occupancy are returned through blockSize and minGridSize.
+ *
+ * If *blockSize is 0, then the given combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+static __OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,      // out
+    int                         *blockSize,        // out
+    const cudaOccDeviceProp     *properties,       // in
+    const cudaOccFuncAttributes *attributes,       // in
+    const cudaOccDeviceState    *state,            // in
+    size_t                     (*blockSizeToDynamicSMemSize)(int), // in
+    size_t                       dynamicSMemSize); // in
+
+/**
+ * The CUDA launch configurator C++ API suggests a grid / block size pair (in
+ * minGridSize and blockSize) that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number of blocks)
+ * for the given function described by attributes, on a device described by
+ * properties with settings in state.
+ *
+ * If per-block dynamic shared memory allocation is 0 or constant regardless of
+ * block size, the user can use cudaOccMaxPotentialOccupancyBlockSize to
+ * configure the launch. A constant dynamic shared memory allocation size in
+ * bytes can be passed through dynamicSMemSize.
+ *
+ * Otherwise, if the per-block dynamic shared memory size varies with different
+ * block sizes, the user needs to use
+ * cudaOccMaxPotentialOccupancyBlockSizeVariableSmem instead, and provide a
+ * functor / pointer to an unary function (blockSizeToDynamicSMemSize) that
+ * computes the dynamic shared memory needed by func for any given block
+ * size. An example signature is:
+ *
+ *  // Take block size, returns per-block dynamic shared memory needed
+ *  size_t blockToSmem(int blockSize);
+ *
+ * RETURN VALUE
+ *
+ * The suggested block size and the minimum number of blocks needed to achieve
+ * the maximum occupancy are returned through blockSize and minGridSize.
+ *
+ * If *blockSize is 0, then the given combination cannot run on the device.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+
+#if defined(__cplusplus)
+namespace {
+
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,          // out
+    int                         *blockSize,            // out
+    const cudaOccDeviceProp     *properties,           // in
+    const cudaOccFuncAttributes *attributes,           // in
+    const cudaOccDeviceState    *state,                // in
+    size_t                       dynamicSMemSize = 0); // in
+
+template <typename UnaryFunction>
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
+    int                         *minGridSize,          // out
+    int                         *blockSize,            // out
+    const cudaOccDeviceProp     *properties,           // in
+    const cudaOccFuncAttributes *attributes,           // in
+    const cudaOccDeviceState    *state,                // in
+    UnaryFunction                blockSizeToDynamicSMemSize); // in
+
+} // namespace anonymous
+#endif // defined(__cplusplus)
+
+/**
+ *
+ * The CUDA dynamic shared memory calculator computes the maximum size of 
+ * per-block dynamic shared memory if we want to place numBlocks blocks
+ * on an SM.
+ *
+ * RETURN VALUE
+ *
+ * Returns in *dynamicSmemSize the maximum size of dynamic shared memory to allow 
+ * numBlocks blocks per SM.
+ *
+ * ERRORS
+ *
+ *     CUDA_OCC_ERROR_INVALID_INPUT   input parameter is invalid.
+ *     CUDA_OCC_ERROR_UNKNOWN_DEVICE  requested device is not supported in
+ *     current implementation or device is invalid
+ *
+ */
+static __OCC_INLINE
+cudaOccError cudaOccAvailableDynamicSMemPerBlock(
+    size_t                      *dynamicSmemSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                         numBlocks,
+    int                         blockSize);
+
+/**
+ * Data structures
+ *
+ * These structures are subject to change for future architecture and CUDA
+ * releases. C users should initialize the structure as {0}.
+ *
+ */
+
+/**
+ * Device descriptor
+ *
+ * This structure describes a device.
+ */
+struct cudaOccDeviceProp {
+    int    computeMajor;                // Compute capability major version
+    int    computeMinor;                // Compute capability minor
+                                        // version. None supported minor version
+                                        // may cause error
+    int    maxThreadsPerBlock;          // Maximum number of threads per block
+    int    maxThreadsPerMultiprocessor; // Maximum number of threads per SM
+                                        // i.e. (Max. number of warps) x (warp
+                                        // size)
+    int    regsPerBlock;                // Maximum number of registers per block
+    int    regsPerMultiprocessor;       // Maximum number of registers per SM
+    int    warpSize;                    // Warp size
+    size_t sharedMemPerBlock;           // Maximum shared memory size per block
+    size_t sharedMemPerMultiprocessor;  // Maximum shared memory size per SM
+    int    numSms;                      // Number of SMs available
+    size_t sharedMemPerBlockOptin;      // Maximum optin shared memory size per block
+    size_t reservedSharedMemPerBlock;   // Shared memory per block reserved by driver
+
+#ifdef __cplusplus
+    // This structure can be converted from a cudaDeviceProp structure for users
+    // that use this header in their CUDA applications.
+    //
+    // If the application have access to the CUDA Runtime API, the application
+    // can obtain the device properties of a CUDA device through
+    // cudaGetDeviceProperties, and initialize a cudaOccDeviceProp with the
+    // cudaDeviceProp structure.
+    //
+    // Example:
+    /*
+     {
+         cudaDeviceProp prop;
+
+         cudaGetDeviceProperties(&prop, ...);
+
+         cudaOccDeviceProp occProp = prop;
+
+         ...
+
+         cudaOccMaxPotentialOccupancyBlockSize(..., &occProp, ...);
+     }
+     */
+    //
+    template<typename DeviceProp>
+    __OCC_INLINE
+    cudaOccDeviceProp(const DeviceProp &props)
+    :   computeMajor                (props.major),
+        computeMinor                (props.minor),
+        maxThreadsPerBlock          (props.maxThreadsPerBlock),
+        maxThreadsPerMultiprocessor (props.maxThreadsPerMultiProcessor),
+        regsPerBlock                (props.regsPerBlock),
+        regsPerMultiprocessor       (props.regsPerMultiprocessor),
+        warpSize                    (props.warpSize),
+        sharedMemPerBlock           (props.sharedMemPerBlock),
+        sharedMemPerMultiprocessor  (props.sharedMemPerMultiprocessor),
+        numSms                      (props.multiProcessorCount),
+        sharedMemPerBlockOptin      (props.sharedMemPerBlockOptin),
+        reservedSharedMemPerBlock   (props.reservedSharedMemPerBlock)
+    {}
+
+    __OCC_INLINE
+    cudaOccDeviceProp()
+    :   computeMajor                (0),
+        computeMinor                (0),
+        maxThreadsPerBlock          (0),
+        maxThreadsPerMultiprocessor (0),
+        regsPerBlock                (0),
+        regsPerMultiprocessor       (0),
+        warpSize                    (0),
+        sharedMemPerBlock           (0),
+        sharedMemPerMultiprocessor  (0),
+        numSms                      (0),
+        sharedMemPerBlockOptin      (0),
+        reservedSharedMemPerBlock   (0)
+    {}
+#endif // __cplusplus
+};
+
+/**
+ * Partitioned global caching option
+ */
+typedef enum cudaOccPartitionedGCConfig_enum {
+    PARTITIONED_GC_OFF,        // Disable partitioned global caching
+    PARTITIONED_GC_ON,         // Prefer partitioned global caching
+    PARTITIONED_GC_ON_STRICT   // Force partitioned global caching
+} cudaOccPartitionedGCConfig;
+
+/**
+ * Per function opt in maximum dynamic shared memory limit
+ */
+typedef enum cudaOccFuncShmemConfig_enum {
+    FUNC_SHMEM_LIMIT_DEFAULT,   // Default shmem limit
+    FUNC_SHMEM_LIMIT_OPTIN,     // Use the optin shmem limit
+} cudaOccFuncShmemConfig;
+
+/**
+ * Function descriptor
+ *
+ * This structure describes a CUDA function.
+ */
+struct cudaOccFuncAttributes {
+    int maxThreadsPerBlock; // Maximum block size the function can work with. If
+                            // unlimited, use INT_MAX or any value greater than
+                            // or equal to maxThreadsPerBlock of the device
+    int numRegs;            // Number of registers used. When the function is
+                            // launched on device, the register count may change
+                            // due to internal tools requirements.
+    size_t sharedSizeBytes; // Number of static shared memory used
+
+    cudaOccPartitionedGCConfig partitionedGCConfig; 
+                            // Partitioned global caching is required to enable
+                            // caching on certain chips, such as sm_52
+                            // devices. Partitioned global caching can be
+                            // automatically disabled if the occupancy
+                            // requirement of the launch cannot support caching.
+                            //
+                            // To override this behavior with caching on and
+                            // calculate occupancy strictly according to the
+                            // preference, set partitionedGCConfig to
+                            // PARTITIONED_GC_ON_STRICT. This is especially
+                            // useful for experimenting and finding launch
+                            // configurations (MaxPotentialOccupancyBlockSize)
+                            // that allow global caching to take effect.
+                            //
+                            // This flag only affects the occupancy calculation.
+
+    cudaOccFuncShmemConfig shmemLimitConfig;
+                            // Certain chips like sm_70 allow a user to opt into
+                            // a higher per block limit of dynamic shared memory
+                            // This optin is performed on a per function basis
+                            // using the cuFuncSetAttribute function
+
+    size_t maxDynamicSharedSizeBytes;
+                            // User set limit on maximum dynamic shared memory
+                            // usable by the kernel
+                            // This limit is set using the cuFuncSetAttribute
+                            // function.
+
+    int numBlockBarriers;   // Number of block barriers used (default to 1)
+#ifdef __cplusplus
+    // This structure can be converted from a cudaFuncAttributes structure for
+    // users that use this header in their CUDA applications.
+    //
+    // If the application have access to the CUDA Runtime API, the application
+    // can obtain the function attributes of a CUDA kernel function through
+    // cudaFuncGetAttributes, and initialize a cudaOccFuncAttributes with the
+    // cudaFuncAttributes structure.
+    //
+    // Example:
+    /*
+      __global__ void foo() {...}
+
+      ...
+
+      {
+          cudaFuncAttributes attr;
+
+          cudaFuncGetAttributes(&attr, foo);
+
+          cudaOccFuncAttributes occAttr = attr;
+
+          ...
+
+          cudaOccMaxPotentialOccupancyBlockSize(..., &occAttr, ...);
+      }
+     */
+    //
+    template<typename FuncAttributes>
+    __OCC_INLINE
+    cudaOccFuncAttributes(const FuncAttributes &attr)
+    :   maxThreadsPerBlock  (attr.maxThreadsPerBlock),
+        numRegs             (attr.numRegs),
+        sharedSizeBytes     (attr.sharedSizeBytes),
+        partitionedGCConfig (PARTITIONED_GC_OFF),
+        shmemLimitConfig    (FUNC_SHMEM_LIMIT_OPTIN),
+        maxDynamicSharedSizeBytes (attr.maxDynamicSharedSizeBytes),
+        numBlockBarriers    (1)
+    {}
+
+    __OCC_INLINE
+    cudaOccFuncAttributes()
+    :   maxThreadsPerBlock  (0),
+        numRegs             (0),
+        sharedSizeBytes     (0),
+        partitionedGCConfig (PARTITIONED_GC_OFF),
+        shmemLimitConfig    (FUNC_SHMEM_LIMIT_DEFAULT),
+        maxDynamicSharedSizeBytes (0),
+        numBlockBarriers    (0)
+    {}
+#endif
+};
+
+typedef enum cudaOccCacheConfig_enum {
+    CACHE_PREFER_NONE   = 0x00, // no preference for shared memory or L1 (default)
+    CACHE_PREFER_SHARED = 0x01, // prefer larger shared memory and smaller L1 cache
+    CACHE_PREFER_L1     = 0x02, // prefer larger L1 cache and smaller shared memory
+    CACHE_PREFER_EQUAL  = 0x03  // prefer equal sized L1 cache and shared memory
+} cudaOccCacheConfig;
+
+typedef enum cudaOccCarveoutConfig_enum {
+    SHAREDMEM_CARVEOUT_DEFAULT       = -1,  // no preference for shared memory or L1 (default)
+    SHAREDMEM_CARVEOUT_MAX_SHARED    = 100, // prefer maximum available shared memory, minimum L1 cache
+    SHAREDMEM_CARVEOUT_MAX_L1        = 0,    // prefer maximum available L1 cache, minimum shared memory
+    SHAREDMEM_CARVEOUT_HALF          = 50   // prefer half of maximum available shared memory, with the rest as L1 cache
+} cudaOccCarveoutConfig;
+
+/**
+ * Device state descriptor
+ *
+ * This structure describes device settings that affect occupancy calculation.
+ */
+struct cudaOccDeviceState
+{
+    // Cache / shared memory split preference. Deprecated on Volta 
+    cudaOccCacheConfig cacheConfig; 
+    // Shared memory / L1 split preference. Supported on only Volta
+    int carveoutConfig;
+
+#ifdef __cplusplus
+    __OCC_INLINE
+    cudaOccDeviceState()
+    :   cacheConfig     (CACHE_PREFER_NONE),
+        carveoutConfig  (SHAREDMEM_CARVEOUT_DEFAULT)
+    {}
+#endif
+};
+
+typedef enum cudaOccLimitingFactor_enum {
+                                    // Occupancy limited due to:
+    OCC_LIMIT_WARPS         = 0x01, // - warps available
+    OCC_LIMIT_REGISTERS     = 0x02, // - registers available
+    OCC_LIMIT_SHARED_MEMORY = 0x04, // - shared memory available
+    OCC_LIMIT_BLOCKS        = 0x08, // - blocks available
+    OCC_LIMIT_BARRIERS      = 0x10  // - barrier available
+} cudaOccLimitingFactor;
+
+/**
+ * Occupancy output
+ *
+ * This structure contains occupancy calculator's output.
+ */
+struct cudaOccResult {
+    int activeBlocksPerMultiprocessor; // Occupancy
+    unsigned int limitingFactors;      // Factors that limited occupancy. A bit
+                                       // field that counts the limiting
+                                       // factors, see cudaOccLimitingFactor
+    int blockLimitRegs;                // Occupancy due to register
+                                       // usage, INT_MAX if the kernel does not
+                                       // use any register.
+    int blockLimitSharedMem;           // Occupancy due to shared memory
+                                       // usage, INT_MAX if the kernel does not
+                                       // use shared memory.
+    int blockLimitWarps;               // Occupancy due to block size limit
+    int blockLimitBlocks;              // Occupancy due to maximum number of blocks
+                                       // managable per SM
+    int blockLimitBarriers;            // Occupancy due to block barrier usage
+    int allocatedRegistersPerBlock;    // Actual number of registers allocated per
+                                       // block
+    size_t allocatedSharedMemPerBlock; // Actual size of shared memory allocated
+                                       // per block
+    cudaOccPartitionedGCConfig partitionedGCConfig;
+                                       // Report if partitioned global caching
+                                       // is actually enabled.
+};
+
+/**
+ * Partitioned global caching support
+ *
+ * See cudaOccPartitionedGlobalCachingModeSupport
+ */
+typedef enum cudaOccPartitionedGCSupport_enum {
+    PARTITIONED_GC_NOT_SUPPORTED,  // Partitioned global caching is not supported
+    PARTITIONED_GC_SUPPORTED,      // Partitioned global caching is supported
+} cudaOccPartitionedGCSupport;
+
+/**
+ * Implementation
+ */
+
+/**
+ * Max compute capability supported
+ */
+#define __CUDA_OCC_MAJOR__ 9
+#define __CUDA_OCC_MINOR__ 0
+
+//////////////////////////////////////////
+//    Mathematical Helper Functions     //
+//////////////////////////////////////////
+
+static __OCC_INLINE int __occMin(int lhs, int rhs)
+{
+    return rhs < lhs ? rhs : lhs;
+}
+
+static __OCC_INLINE int __occDivideRoundUp(int x, int y)
+{
+    return (x + (y - 1)) / y;
+}
+
+static __OCC_INLINE int __occRoundUp(int x, int y)
+{
+    return y * __occDivideRoundUp(x, y);
+}
+
+//////////////////////////////////////////
+//      Architectural Properties        //
+//////////////////////////////////////////
+
+/**
+ * Granularity of shared memory allocation
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
+{
+    int value;
+
+    switch(properties->computeMajor) {
+        case 3:
+        case 5:
+        case 6:
+        case 7:
+            value = 256;
+            break;
+        case 8:
+        case 9:
+            value = 128;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = value;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Maximum number of registers per thread
+ */
+static __OCC_INLINE cudaOccError cudaOccRegAllocationMaxPerThread(int *limit, const cudaOccDeviceProp *properties)
+{
+    int value;
+
+    switch(properties->computeMajor) {
+        case 3:
+        case 5:
+        case 6:
+            value = 255;
+            break;
+        case 7:
+        case 8:
+        case 9:
+            value = 256;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = value;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Granularity of register allocation
+ */
+static __OCC_INLINE cudaOccError cudaOccRegAllocationGranularity(int *limit, const cudaOccDeviceProp *properties)
+{
+    int value;
+
+    switch(properties->computeMajor) {
+        case 3:
+        case 5:
+        case 6:
+        case 7:
+        case 8:
+        case 9:
+            value = 256;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = value;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Number of sub-partitions
+ */
+static __OCC_INLINE cudaOccError cudaOccSubPartitionsPerMultiprocessor(int *limit, const cudaOccDeviceProp *properties)
+{
+    int value;
+
+    switch(properties->computeMajor) {
+        case 3:
+        case 5:
+        case 7:
+        case 8:
+        case 9:
+            value = 4;
+            break;
+        case 6:
+            value = properties->computeMinor ? 4 : 2;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = value;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+
+/**
+ * Maximum number of blocks that can run simultaneously on a multiprocessor
+ */
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerMultiprocessor(int* limit, const cudaOccDeviceProp *properties)
+{
+    int value;
+
+    switch(properties->computeMajor) {
+        case 3:
+            value = 16;
+            break;
+        case 5:
+        case 6:
+            value = 32;
+            break;
+        case 7: {
+            int isTuring = properties->computeMinor == 5;
+            value = (isTuring) ? 16 : 32;
+            break;
+        }
+        case 8:
+            if (properties->computeMinor == 0) {
+                value = 32;
+            }
+            else if (properties->computeMinor == 9) {
+                value = 24;
+            }
+            else {
+                value = 16;
+            }
+            break;
+        case 9:
+            value = 32;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = value;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/** 
+ * Align up shared memory based on compute major configurations
+ */
+static __OCC_INLINE cudaOccError cudaOccAlignUpShmemSizeVoltaPlus(size_t *shMemSize, const cudaOccDeviceProp *properties)
+{
+    // Volta and Turing have shared L1 cache / shared memory, and support cache
+    // configuration to trade one for the other. These values are needed to
+    // map carveout config ratio to the next available architecture size
+    size_t size = *shMemSize;
+
+    switch (properties->computeMajor) {
+    case 7: {
+        // Turing supports 32KB and 64KB shared mem.
+        int isTuring = properties->computeMinor == 5;
+        if (isTuring) {
+            if      (size <= 32 * 1024) {
+                *shMemSize = 32 * 1024;
+            }
+            else if (size <= 64 * 1024) {
+                *shMemSize = 64 * 1024;
+            }
+            else {
+                return CUDA_OCC_ERROR_INVALID_INPUT;
+            }
+        }
+        // Volta supports 0KB, 8KB, 16KB, 32KB, 64KB, and 96KB shared mem.
+        else {
+            if      (size == 0) {
+                *shMemSize = 0;
+            }
+            else if (size <= 8 * 1024) {
+                *shMemSize = 8 * 1024;
+            }
+            else if (size <= 16 * 1024) {
+                *shMemSize = 16 * 1024;
+            }
+            else if (size <= 32 * 1024) {
+                *shMemSize = 32 * 1024;
+            }
+            else if (size <= 64 * 1024) {
+                *shMemSize = 64 * 1024;
+            }
+            else if (size <= 96 * 1024) {
+                *shMemSize = 96 * 1024;
+            }
+            else {
+                return CUDA_OCC_ERROR_INVALID_INPUT;
+            }
+        }
+        break;
+    }
+    case 8:
+        if (properties->computeMinor == 0 || properties->computeMinor == 7) {
+            if      (size == 0) {
+                *shMemSize = 0;
+            }
+            else if (size <= 8 * 1024) {
+                *shMemSize = 8 * 1024;
+            }
+            else if (size <= 16 * 1024) {
+                *shMemSize = 16 * 1024;
+            }
+            else if (size <= 32 * 1024) {
+                *shMemSize = 32 * 1024;
+            }
+            else if (size <= 64 * 1024) {
+                *shMemSize = 64 * 1024;
+            }
+            else if (size <= 100 * 1024) {
+                *shMemSize = 100 * 1024;
+            }
+            else if (size <= 132 * 1024) {
+                *shMemSize = 132 * 1024;
+            }
+            else if (size <= 164 * 1024) {
+                *shMemSize = 164 * 1024;
+            }
+            else {
+                return CUDA_OCC_ERROR_INVALID_INPUT;
+            }
+        }
+        else {
+            if      (size == 0) {
+                *shMemSize = 0;
+            }
+            else if (size <= 8 * 1024) {
+                *shMemSize = 8 * 1024;
+            }
+            else if (size <= 16 * 1024) {
+                *shMemSize = 16 * 1024;
+            }
+            else if (size <= 32 * 1024) {
+                *shMemSize = 32 * 1024;
+            }
+            else if (size <= 64 * 1024) {
+                *shMemSize = 64 * 1024;
+            }
+            else if (size <= 100 * 1024) {
+                *shMemSize = 100 * 1024;
+            }
+            else {
+                return CUDA_OCC_ERROR_INVALID_INPUT;
+            }
+        }
+        break;
+    case 9: {
+        if      (size == 0) {
+            *shMemSize = 0;
+        }
+        else if (size <= 8 * 1024) {
+            *shMemSize = 8 * 1024;
+        }
+        else if (size <= 16 * 1024) {
+            *shMemSize = 16 * 1024;
+        }
+        else if (size <= 32 * 1024) {
+            *shMemSize = 32 * 1024;
+        }
+        else if (size <= 64 * 1024) {
+            *shMemSize = 64 * 1024;
+        }
+        else if (size <= 100 * 1024) {
+            *shMemSize = 100 * 1024;
+        }
+        else if (size <= 132 * 1024) {
+            *shMemSize = 132 * 1024;
+        }
+        else if (size <= 164 * 1024) {
+            *shMemSize = 164 * 1024;
+        }
+        else if (size <= 196 * 1024) {
+            *shMemSize = 196 * 1024;
+        }
+        else if (size <= 228 * 1024) {
+            *shMemSize = 228 * 1024;
+        }
+        else {
+            return CUDA_OCC_ERROR_INVALID_INPUT;
+        }
+        break;
+    }
+    default:
+        return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Shared memory based on the new carveoutConfig API introduced with Volta
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPreferenceVoltaPlus(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    size_t preferenceShmemSize;
+
+    // CUDA 9.0 introduces a new API to set shared memory - L1 configuration on supported
+    // devices. This preference will take precedence over the older cacheConfig setting.
+    // Map cacheConfig to its effective preference value.
+    int effectivePreference = state->carveoutConfig;
+    if ((effectivePreference < SHAREDMEM_CARVEOUT_DEFAULT) || (effectivePreference > SHAREDMEM_CARVEOUT_MAX_SHARED)) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+    
+    if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
+        switch (state->cacheConfig)
+        {
+        case CACHE_PREFER_L1:
+            effectivePreference = SHAREDMEM_CARVEOUT_MAX_L1;
+            break;
+        case CACHE_PREFER_SHARED:
+            effectivePreference = SHAREDMEM_CARVEOUT_MAX_SHARED;
+            break;
+        case CACHE_PREFER_EQUAL:
+            effectivePreference = SHAREDMEM_CARVEOUT_HALF;
+            break;
+        default:
+            effectivePreference = SHAREDMEM_CARVEOUT_DEFAULT;
+            break;
+        }
+    }
+
+    if (effectivePreference == SHAREDMEM_CARVEOUT_DEFAULT) {
+        preferenceShmemSize = properties->sharedMemPerMultiprocessor;
+    }
+    else {
+        preferenceShmemSize = (size_t) (effectivePreference * properties->sharedMemPerMultiprocessor) / 100;
+    }
+
+    status = cudaOccAlignUpShmemSizeVoltaPlus(&preferenceShmemSize, properties);
+    *limit = preferenceShmemSize;
+    return status;
+}
+
+/**
+ * Shared memory based on the cacheConfig
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPreference(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    size_t bytes                          = 0;
+    size_t sharedMemPerMultiprocessorHigh = properties->sharedMemPerMultiprocessor;
+    cudaOccCacheConfig cacheConfig        = state->cacheConfig;
+
+    // Kepler has shared L1 cache / shared memory, and support cache
+    // configuration to trade one for the other. These values are needed to
+    // calculate the correct shared memory size for user requested cache
+    // configuration.
+    //
+    size_t minCacheSize                   = 16384;
+    size_t maxCacheSize                   = 49152;
+    size_t cacheAndSharedTotal            = sharedMemPerMultiprocessorHigh + minCacheSize;
+    size_t sharedMemPerMultiprocessorLow  = cacheAndSharedTotal - maxCacheSize;
+
+    switch (properties->computeMajor) {
+        case 3:
+            // Kepler supports 16KB, 32KB, or 48KB partitions for L1. The rest
+            // is shared memory.
+            //
+            switch (cacheConfig) {
+                default :
+                case CACHE_PREFER_NONE:
+                case CACHE_PREFER_SHARED:
+                    bytes = sharedMemPerMultiprocessorHigh;
+                    break;
+                case CACHE_PREFER_L1:
+                    bytes = sharedMemPerMultiprocessorLow;
+                    break;
+                case CACHE_PREFER_EQUAL:
+                    // Equal is the mid-point between high and low. It should be
+                    // equivalent to low + 16KB.
+                    //
+                    bytes = (sharedMemPerMultiprocessorHigh + sharedMemPerMultiprocessorLow) / 2;
+                    break;
+            }
+            break;
+        case 5:
+        case 6:
+            // Maxwell and Pascal have dedicated shared memory.
+            //
+            bytes = sharedMemPerMultiprocessorHigh;
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    *limit = bytes;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Shared memory based on config requested by User
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPerMultiprocessor(size_t *limit, const cudaOccDeviceProp *properties, const cudaOccDeviceState *state)
+{
+    // Volta introduces a new API that allows for shared memory carveout preference. Because it is a shared memory preference,
+    // it is handled separately from the cache config preference.
+    if (properties->computeMajor >= 7) {
+        return cudaOccSMemPreferenceVoltaPlus(limit, properties, state);
+    }
+    return cudaOccSMemPreference(limit, properties, state);
+}
+
+/**
+ * Return the per block shared memory limit based on function config
+ */
+static __OCC_INLINE cudaOccError cudaOccSMemPerBlock(size_t *limit, const cudaOccDeviceProp *properties, cudaOccFuncShmemConfig shmemLimitConfig, size_t smemPerCta)
+{
+    switch (properties->computeMajor) {
+        case 2:
+        case 3:
+        case 4:
+        case 5:
+        case 6:
+            *limit = properties->sharedMemPerBlock;
+            break;
+        case 7:
+        case 8:
+        case 9:
+            switch (shmemLimitConfig) {
+                default:
+                case FUNC_SHMEM_LIMIT_DEFAULT:
+                    *limit = properties->sharedMemPerBlock;
+                    break;
+                case FUNC_SHMEM_LIMIT_OPTIN:
+                    if (smemPerCta > properties->sharedMemPerBlock) {
+                        *limit = properties->sharedMemPerBlockOptin;
+                    }
+                    else {
+                        *limit = properties->sharedMemPerBlock;
+                    }
+                    break;
+            }
+            break;
+        default:
+            return CUDA_OCC_ERROR_UNKNOWN_DEVICE;
+    }
+
+    // Starting Ampere, CUDA driver reserves additional shared memory per block
+    if (properties->computeMajor >= 8) {
+        *limit += properties->reservedSharedMemPerBlock;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+/**
+ * Partitioned global caching mode support
+ */
+static __OCC_INLINE cudaOccError cudaOccPartitionedGlobalCachingModeSupport(cudaOccPartitionedGCSupport *limit, const cudaOccDeviceProp *properties)
+{
+    *limit = PARTITIONED_GC_NOT_SUPPORTED;
+
+    if ((properties->computeMajor == 5 && (properties->computeMinor == 2 || properties->computeMinor == 3)) ||
+        properties->computeMajor == 6) {
+        *limit = PARTITIONED_GC_SUPPORTED;
+    }
+
+    if (properties->computeMajor == 6 && properties->computeMinor == 0) {
+        *limit = PARTITIONED_GC_NOT_SUPPORTED;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+///////////////////////////////////////////////
+//            User Input Sanity              //
+///////////////////////////////////////////////
+
+static __OCC_INLINE cudaOccError cudaOccDevicePropCheck(const cudaOccDeviceProp *properties)
+{
+    // Verify device properties
+    //
+    // Each of these limits must be a positive number.
+    //
+    // Compute capacity is checked during the occupancy calculation
+    //
+    if (properties->maxThreadsPerBlock          <= 0 ||
+        properties->maxThreadsPerMultiprocessor <= 0 ||
+        properties->regsPerBlock                <= 0 ||
+        properties->regsPerMultiprocessor       <= 0 ||
+        properties->warpSize                    <= 0 ||
+        properties->sharedMemPerBlock           <= 0 ||
+        properties->sharedMemPerMultiprocessor  <= 0 ||
+        properties->numSms                      <= 0) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE cudaOccError cudaOccFuncAttributesCheck(const cudaOccFuncAttributes *attributes)
+{
+    // Verify function attributes
+    //
+    if (attributes->maxThreadsPerBlock <= 0 ||
+        attributes->numRegs < 0) {            // Compiler may choose not to use
+                                              // any register (empty kernels,
+                                              // etc.)
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE cudaOccError cudaOccDeviceStateCheck(const cudaOccDeviceState *state)
+{
+    (void)state;   // silence unused-variable warning
+    // Placeholder
+    //
+
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE cudaOccError cudaOccInputCheck(
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+
+    status = cudaOccDevicePropCheck(properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    status = cudaOccFuncAttributesCheck(attributes);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    status = cudaOccDeviceStateCheck(state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    return status;
+}
+
+///////////////////////////////////////////////
+//    Occupancy calculation Functions        //
+///////////////////////////////////////////////
+
+static __OCC_INLINE cudaOccPartitionedGCConfig cudaOccPartitionedGCExpected(
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes)
+{
+    cudaOccPartitionedGCSupport gcSupport;
+    cudaOccPartitionedGCConfig gcConfig;
+
+    cudaOccPartitionedGlobalCachingModeSupport(&gcSupport, properties);
+
+    gcConfig = attributes->partitionedGCConfig;
+
+    if (gcSupport == PARTITIONED_GC_NOT_SUPPORTED) {
+        gcConfig = PARTITIONED_GC_OFF;
+    }
+
+    return gcConfig;
+}
+
+// Warp limit
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMWarpsLimit(
+    int                         *limit,
+    cudaOccPartitionedGCConfig   gcConfig,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    int                          blockSize)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int maxWarpsPerSm;
+    int warpsAllocatedPerCTA;
+    int maxBlocks;
+    (void)attributes;   // silence unused-variable warning
+
+    if (blockSize > properties->maxThreadsPerBlock) {
+        maxBlocks = 0;
+    }
+    else {
+        maxWarpsPerSm = properties->maxThreadsPerMultiprocessor / properties->warpSize;
+        warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
+        maxBlocks = 0;
+
+        if (gcConfig != PARTITIONED_GC_OFF) {
+            int maxBlocksPerSmPartition;
+            int maxWarpsPerSmPartition;
+
+            // If partitioned global caching is on, then a CTA can only use a SM
+            // partition (a half SM), and thus a half of the warp slots
+            // available per SM
+            //
+            maxWarpsPerSmPartition  = maxWarpsPerSm / 2;
+            maxBlocksPerSmPartition = maxWarpsPerSmPartition / warpsAllocatedPerCTA;
+            maxBlocks               = maxBlocksPerSmPartition * 2;
+        }
+        // On hardware that supports partitioned global caching, each half SM is
+        // guaranteed to support at least 32 warps (maximum number of warps of a
+        // CTA), so caching will not cause 0 occupancy due to insufficient warp
+        // allocation slots.
+        //
+        else {
+            maxBlocks = maxWarpsPerSm / warpsAllocatedPerCTA;
+        }
+    }
+
+    *limit = maxBlocks;
+
+    return status;
+}
+
+// Shared memory limit
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMSmemLimit(
+    int                         *limit,
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                          blockSize,
+    size_t                       dynamicSmemSize)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int allocationGranularity;
+    size_t userSmemPreference = 0;
+    size_t totalSmemUsagePerCTA;
+    size_t maxSmemUsagePerCTA;
+    size_t smemAllocatedPerCTA;
+    size_t staticSmemSize;
+    size_t sharedMemPerMultiprocessor;
+    size_t smemLimitPerCTA;
+    int maxBlocks;
+    int dynamicSmemSizeExceeded = 0;
+    int totalSmemSizeExceeded = 0;
+    (void)blockSize;   // silence unused-variable warning
+
+    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Obtain the user preferred shared memory size. This setting is ignored if
+    // user requests more shared memory than preferred.
+    //
+    status = cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    staticSmemSize = attributes->sharedSizeBytes + properties->reservedSharedMemPerBlock;
+    totalSmemUsagePerCTA = staticSmemSize + dynamicSmemSize;
+    smemAllocatedPerCTA = __occRoundUp((int)totalSmemUsagePerCTA, (int)allocationGranularity);
+
+    maxSmemUsagePerCTA = staticSmemSize + attributes->maxDynamicSharedSizeBytes;
+
+    dynamicSmemSizeExceeded = 0;
+    totalSmemSizeExceeded   = 0;
+
+    // Obtain the user set maximum dynamic size if it exists
+    // If so, the current launch dynamic shared memory must not
+    // exceed the set limit
+    if (attributes->shmemLimitConfig != FUNC_SHMEM_LIMIT_DEFAULT &&
+        dynamicSmemSize > attributes->maxDynamicSharedSizeBytes) {
+        dynamicSmemSizeExceeded = 1;
+    }
+
+    status = cudaOccSMemPerBlock(&smemLimitPerCTA, properties, attributes->shmemLimitConfig, maxSmemUsagePerCTA);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    if (smemAllocatedPerCTA > smemLimitPerCTA) {
+        totalSmemSizeExceeded = 1;
+    }
+
+    if (dynamicSmemSizeExceeded || totalSmemSizeExceeded) {
+        maxBlocks = 0;
+    }
+    else {
+        // User requested shared memory limit is used as long as it is greater
+        // than the total shared memory used per CTA, i.e. as long as at least
+        // one CTA can be launched.
+        if (userSmemPreference >= smemAllocatedPerCTA) {
+            sharedMemPerMultiprocessor = userSmemPreference;
+        }
+        else {
+            // On Volta+, user requested shared memory will limit occupancy
+            // if it's less than shared memory per CTA. Otherwise, the
+            // maximum shared memory limit is used.
+            if (properties->computeMajor >= 7) {
+                sharedMemPerMultiprocessor = smemAllocatedPerCTA;
+                status = cudaOccAlignUpShmemSizeVoltaPlus(&sharedMemPerMultiprocessor, properties);
+                if (status != CUDA_OCC_SUCCESS) {
+                    return status;
+                }
+            }
+            else {
+                sharedMemPerMultiprocessor = properties->sharedMemPerMultiprocessor;
+            }
+        }
+
+        if (smemAllocatedPerCTA > 0) {
+            maxBlocks = (int)(sharedMemPerMultiprocessor / smemAllocatedPerCTA);
+        }
+        else {
+            maxBlocks = INT_MAX;
+        }
+    }
+
+    result->allocatedSharedMemPerBlock = smemAllocatedPerCTA;
+
+    *limit = maxBlocks;
+
+    return status;
+}
+
+static __OCC_INLINE
+cudaOccError cudaOccMaxBlocksPerSMRegsLimit(
+    int                         *limit,
+    cudaOccPartitionedGCConfig  *gcConfig,
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    int                          blockSize)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int allocationGranularity;
+    int warpsAllocatedPerCTA;
+    int regsAllocatedPerCTA;
+    int regsAssumedPerCTA;
+    int regsPerWarp;
+    int regsAllocatedPerWarp;
+    int numSubPartitions;
+    int numRegsPerSubPartition;
+    int numWarpsPerSubPartition;
+    int numWarpsPerSM;
+    int maxBlocks;
+    int maxRegsPerThread;
+
+    status = cudaOccRegAllocationGranularity(
+        &allocationGranularity,
+        properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    status = cudaOccRegAllocationMaxPerThread(
+        &maxRegsPerThread,
+        properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    status = cudaOccSubPartitionsPerMultiprocessor(&numSubPartitions, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    warpsAllocatedPerCTA = __occDivideRoundUp(blockSize, properties->warpSize);
+
+    // GPUs of compute capability 2.x and higher allocate registers to warps
+    //
+    // Number of regs per warp is regs per thread x warp size, rounded up to
+    // register allocation granularity
+    //
+    regsPerWarp          = attributes->numRegs * properties->warpSize;
+    regsAllocatedPerWarp = __occRoundUp(regsPerWarp, allocationGranularity);
+    regsAllocatedPerCTA  = regsAllocatedPerWarp * warpsAllocatedPerCTA;
+
+    // Hardware verifies if a launch fits the per-CTA register limit. For
+    // historical reasons, the verification logic assumes register
+    // allocations are made to all partitions simultaneously. Therefore, to
+    // simulate the hardware check, the warp allocation needs to be rounded
+    // up to the number of partitions.
+    //
+    regsAssumedPerCTA = regsAllocatedPerWarp * __occRoundUp(warpsAllocatedPerCTA, numSubPartitions);
+
+    if (properties->regsPerBlock < regsAssumedPerCTA ||   // Hardware check
+        properties->regsPerBlock < regsAllocatedPerCTA || // Software check
+        attributes->numRegs > maxRegsPerThread) {         // Per thread limit check
+        maxBlocks = 0;
+    }
+    else {
+        if (regsAllocatedPerWarp > 0) {
+            // Registers are allocated in each sub-partition. The max number
+            // of warps that can fit on an SM is equal to the max number of
+            // warps per sub-partition x number of sub-partitions.
+            //
+            numRegsPerSubPartition  = properties->regsPerMultiprocessor / numSubPartitions;
+            numWarpsPerSubPartition = numRegsPerSubPartition / regsAllocatedPerWarp;
+
+            maxBlocks = 0;
+
+            if (*gcConfig != PARTITIONED_GC_OFF) {
+                int numSubPartitionsPerSmPartition;
+                int numWarpsPerSmPartition;
+                int maxBlocksPerSmPartition;
+
+                // If partitioned global caching is on, then a CTA can only
+                // use a half SM, and thus a half of the registers available
+                // per SM
+                //
+                numSubPartitionsPerSmPartition = numSubPartitions / 2;
+                numWarpsPerSmPartition         = numWarpsPerSubPartition * numSubPartitionsPerSmPartition;
+                maxBlocksPerSmPartition        = numWarpsPerSmPartition / warpsAllocatedPerCTA;
+                maxBlocks                      = maxBlocksPerSmPartition * 2;
+            }
+
+            // Try again if partitioned global caching is not enabled, or if
+            // the CTA cannot fit on the SM with caching on (maxBlocks == 0).  In the latter
+            // case, the device will automatically turn off caching, except
+            // if the user forces enablement via PARTITIONED_GC_ON_STRICT to calculate
+            // occupancy and launch configuration.
+            //
+            if (maxBlocks == 0 && *gcConfig != PARTITIONED_GC_ON_STRICT) {
+               // In case *gcConfig was PARTITIONED_GC_ON flip it OFF since
+               // this is what it will be if we spread CTA across partitions.
+               //
+               *gcConfig = PARTITIONED_GC_OFF;
+               numWarpsPerSM = numWarpsPerSubPartition * numSubPartitions;
+               maxBlocks     = numWarpsPerSM / warpsAllocatedPerCTA;
+            }
+        }
+        else {
+            maxBlocks = INT_MAX;
+        }
+    }
+
+
+    result->allocatedRegistersPerBlock = regsAllocatedPerCTA;
+
+    *limit = maxBlocks;
+
+    return status;
+}
+
+// Barrier limit
+//
+static __OCC_INLINE cudaOccError cudaOccMaxBlocksPerSMBlockBarrierLimit(
+    int                         *limit,
+    int                          ctaLimitBlocks,
+    const cudaOccFuncAttributes *attributes)
+{
+    cudaOccError status = CUDA_OCC_SUCCESS;
+    int numBarriersAvailable = ctaLimitBlocks * 2;
+    int numBarriersUsed = attributes->numBlockBarriers;
+    int maxBlocks = INT_MAX;
+
+    if (numBarriersUsed) {
+        maxBlocks = numBarriersAvailable / numBarriersUsed;
+    }
+
+    *limit = maxBlocks;
+
+    return status;
+}
+
+///////////////////////////////////
+//      API Implementations      //
+///////////////////////////////////
+
+static __OCC_INLINE
+cudaOccError cudaOccMaxActiveBlocksPerMultiprocessor(
+    cudaOccResult               *result,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                          blockSize,
+    size_t                       dynamicSmemSize)
+{
+    cudaOccError status          = CUDA_OCC_SUCCESS;
+    int          ctaLimitWarps   = 0;
+    int          ctaLimitBlocks  = 0;
+    int          ctaLimitSMem    = 0;
+    int          ctaLimitRegs    = 0;
+    int          ctaLimitBars    = 0;
+    int          ctaLimit        = 0;
+    unsigned int limitingFactors = 0;
+    
+    cudaOccPartitionedGCConfig gcConfig = PARTITIONED_GC_OFF;
+
+    if (!result || !properties || !attributes || !state || blockSize <= 0) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    status = cudaOccInputCheck(properties, attributes, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    ///////////////////////////
+    // Initialization
+    ///////////////////////////
+
+    gcConfig = cudaOccPartitionedGCExpected(properties, attributes);
+
+    ///////////////////////////
+    // Compute occupancy
+    ///////////////////////////
+
+    // Limits due to registers/SM
+    // Also compute if partitioned global caching has to be turned off
+    //
+    status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegs, &gcConfig, result, properties, attributes, blockSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // SMs on GP100 (6.0) have 2 subpartitions, while those on GP10x have 4.
+    // As a result, an SM on GP100 may be able to run more CTAs than the one on GP10x.
+    // For forward compatibility within Pascal family, if a function cannot run on GP10x (maxBlock == 0),
+    // we do not let it run on any Pascal processor, even though it may be able to run on GP100.
+    // Therefore, we check the occupancy on GP10x when it can run on GP100
+    //
+    if (properties->computeMajor == 6 && properties->computeMinor == 0 && ctaLimitRegs) {
+        cudaOccDeviceProp propertiesGP10x;
+        cudaOccPartitionedGCConfig gcConfigGP10x = gcConfig;
+        int ctaLimitRegsGP10x = 0;
+
+        // Set up properties for GP10x
+        memcpy(&propertiesGP10x, properties, sizeof(propertiesGP10x));
+        propertiesGP10x.computeMinor = 1;
+
+        status = cudaOccMaxBlocksPerSMRegsLimit(&ctaLimitRegsGP10x, &gcConfigGP10x, result, &propertiesGP10x, attributes, blockSize);
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        if (ctaLimitRegsGP10x == 0) {
+            ctaLimitRegs = 0;
+        }
+    }
+
+    // Limits due to warps/SM
+    //
+    status = cudaOccMaxBlocksPerSMWarpsLimit(&ctaLimitWarps, gcConfig, properties, attributes, blockSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Limits due to blocks/SM
+    //
+    status = cudaOccMaxBlocksPerMultiprocessor(&ctaLimitBlocks, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Limits due to shared memory/SM
+    //
+    status = cudaOccMaxBlocksPerSMSmemLimit(&ctaLimitSMem, result, properties, attributes, state, blockSize, dynamicSmemSize);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    ///////////////////////////
+    // Overall occupancy
+    ///////////////////////////
+
+    // Overall limit is min() of limits due to above reasons
+    //
+    ctaLimit = __occMin(ctaLimitRegs, __occMin(ctaLimitSMem, __occMin(ctaLimitWarps, ctaLimitBlocks)));
+
+    // Determine occupancy limiting factors
+    //
+    if (ctaLimit == ctaLimitWarps) {
+        limitingFactors |= OCC_LIMIT_WARPS;
+    }
+    if (ctaLimit == ctaLimitRegs) {
+        limitingFactors |= OCC_LIMIT_REGISTERS;
+    }
+    if (ctaLimit == ctaLimitSMem) {
+        limitingFactors |= OCC_LIMIT_SHARED_MEMORY;
+    }
+    if (ctaLimit == ctaLimitBlocks) {
+        limitingFactors |= OCC_LIMIT_BLOCKS;
+    }
+
+    // For Hopper onwards compute the limits to occupancy based on block barrier count
+    //
+    if (properties->computeMajor >= 9 && attributes->numBlockBarriers > 0) {
+        // Limits due to barrier/SM
+        //
+        status = cudaOccMaxBlocksPerSMBlockBarrierLimit(&ctaLimitBars, ctaLimitBlocks, attributes);
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        // Recompute overall limit based on barrier/SM
+        //
+        ctaLimit = __occMin(ctaLimitBars, ctaLimit);
+
+        // Determine if this is occupancy limiting factor
+        //
+        if (ctaLimit == ctaLimitBars) {
+            limitingFactors |= OCC_LIMIT_BARRIERS;
+        }
+    }
+    else {
+        ctaLimitBars = INT_MAX;
+    }
+
+    // Fill in the return values
+    //
+    result->limitingFactors = limitingFactors;
+
+    result->blockLimitRegs      = ctaLimitRegs;
+    result->blockLimitSharedMem = ctaLimitSMem;
+    result->blockLimitWarps     = ctaLimitWarps;
+    result->blockLimitBlocks    = ctaLimitBlocks;
+    result->blockLimitBarriers  = ctaLimitBars;
+    result->partitionedGCConfig = gcConfig;
+
+    // Final occupancy
+    result->activeBlocksPerMultiprocessor = ctaLimit;
+
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE
+cudaOccError cudaOccAvailableDynamicSMemPerBlock(
+    size_t                      *bytesAvailable,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    int                         numBlocks,
+    int                         blockSize)
+{
+    int allocationGranularity;
+    size_t smemLimitPerBlock;
+    size_t smemAvailableForDynamic;
+    size_t userSmemPreference = 0;
+    size_t sharedMemPerMultiprocessor;
+    cudaOccResult result;
+    cudaOccError status = CUDA_OCC_SUCCESS;
+
+    if (numBlocks <= 0)
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+
+    // First compute occupancy of potential kernel launch.
+    //
+    status = cudaOccMaxActiveBlocksPerMultiprocessor(&result, properties, attributes, state, blockSize, 0);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+    // Check if occupancy is achievable given user requested number of blocks. 
+    //
+    if (result.activeBlocksPerMultiprocessor < numBlocks) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    status = cudaOccSMemAllocationGranularity(&allocationGranularity, properties);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // Return the per block shared memory limit based on function config.
+    //
+    status = cudaOccSMemPerBlock(&smemLimitPerBlock, properties, attributes->shmemLimitConfig, properties->sharedMemPerMultiprocessor);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    // If there is only a single block needed per SM, then the user preference can be ignored and the fully SW
+    // limit is allowed to be used as shared memory otherwise if more than one block is needed, then the user
+    // preference sets the total limit of available shared memory.
+    //
+    cudaOccSMemPerMultiprocessor(&userSmemPreference, properties, state);
+    if (numBlocks == 1) {
+        sharedMemPerMultiprocessor = smemLimitPerBlock;
+    }
+    else {
+        if (!userSmemPreference) {
+            userSmemPreference = 1 ;
+            status = cudaOccAlignUpShmemSizeVoltaPlus(&userSmemPreference, properties);
+            if (status != CUDA_OCC_SUCCESS) {
+                return status;
+            }
+        }
+        sharedMemPerMultiprocessor = userSmemPreference;
+    }
+
+    // Compute total shared memory available per SM
+    //
+    smemAvailableForDynamic =  sharedMemPerMultiprocessor / numBlocks;
+    smemAvailableForDynamic = (smemAvailableForDynamic / allocationGranularity) * allocationGranularity;
+
+    // Cap shared memory
+    //
+    if (smemAvailableForDynamic > smemLimitPerBlock) {
+        smemAvailableForDynamic = smemLimitPerBlock;
+    }
+
+    // Now compute dynamic shared memory size
+    smemAvailableForDynamic = smemAvailableForDynamic - attributes->sharedSizeBytes; 
+
+    // Cap computed dynamic SM by user requested limit specified via cuFuncSetAttribute()
+    //
+    if (smemAvailableForDynamic > attributes->maxDynamicSharedSizeBytes)
+        smemAvailableForDynamic = attributes->maxDynamicSharedSizeBytes;
+
+    *bytesAvailable = smemAvailableForDynamic;
+    return CUDA_OCC_SUCCESS;
+}
+
+static __OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    size_t                     (*blockSizeToDynamicSMemSize)(int),
+    size_t                       dynamicSMemSize)
+{
+    cudaOccError  status = CUDA_OCC_SUCCESS;
+    cudaOccResult result;
+
+    // Limits
+    int occupancyLimit;
+    int granularity;
+    int blockSizeLimit;
+
+    // Recorded maximum
+    int maxBlockSize = 0;
+    int numBlocks    = 0;
+    int maxOccupancy = 0;
+
+    // Temporary
+    int blockSizeToTryAligned;
+    int blockSizeToTry;
+    int blockSizeLimitAligned;
+    int occupancyInBlocks;
+    int occupancyInThreads;
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    if (!minGridSize || !blockSize || !properties || !attributes || !state) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    status = cudaOccInputCheck(properties, attributes, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try each block size, and pick the block size with maximum occupancy
+    /////////////////////////////////////////////////////////////////////////////////
+
+    occupancyLimit = properties->maxThreadsPerMultiprocessor;
+    granularity    = properties->warpSize;
+
+    blockSizeLimit        = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
+    blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
+
+    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
+        blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
+
+        // Ignore dynamicSMemSize if the user provides a mapping
+        //
+        if (blockSizeToDynamicSMemSize) {
+            dynamicSMemSize = (*blockSizeToDynamicSMemSize)(blockSizeToTry);
+        }
+
+        status = cudaOccMaxActiveBlocksPerMultiprocessor(
+            &result,
+            properties,
+            attributes,
+            state,
+            blockSizeToTry,
+            dynamicSMemSize);
+
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        occupancyInBlocks = result.activeBlocksPerMultiprocessor;
+        occupancyInThreads = blockSizeToTry * occupancyInBlocks;
+
+        if (occupancyInThreads > maxOccupancy) {
+            maxBlockSize = blockSizeToTry;
+            numBlocks    = occupancyInBlocks;
+            maxOccupancy = occupancyInThreads;
+        }
+
+        // Early out if we have reached the maximum
+        //
+        if (occupancyLimit == maxOccupancy) {
+            break;
+        }
+    }
+
+    ///////////////////////////
+    // Return best available
+    ///////////////////////////
+
+    // Suggested min grid size to achieve a full machine launch
+    //
+    *minGridSize = numBlocks * properties->numSms;
+    *blockSize = maxBlockSize;
+
+    return status;
+}
+
+
+#if defined(__cplusplus)
+
+namespace {
+
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSize(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    size_t                       dynamicSMemSize)
+{
+    return cudaOccMaxPotentialOccupancyBlockSize(
+        minGridSize,
+        blockSize,
+        properties,
+        attributes,
+        state,
+        NULL,
+        dynamicSMemSize);
+}
+
+template <typename UnaryFunction>
+__OCC_INLINE
+cudaOccError cudaOccMaxPotentialOccupancyBlockSizeVariableSMem(
+    int                         *minGridSize,
+    int                         *blockSize,
+    const cudaOccDeviceProp     *properties,
+    const cudaOccFuncAttributes *attributes,
+    const cudaOccDeviceState    *state,
+    UnaryFunction                blockSizeToDynamicSMemSize)
+{
+    cudaOccError  status = CUDA_OCC_SUCCESS;
+    cudaOccResult result;
+
+    // Limits
+    int occupancyLimit;
+    int granularity;
+    int blockSizeLimit;
+
+    // Recorded maximum
+    int maxBlockSize = 0;
+    int numBlocks    = 0;
+    int maxOccupancy = 0;
+
+    // Temporary
+    int blockSizeToTryAligned;
+    int blockSizeToTry;
+    int blockSizeLimitAligned;
+    int occupancyInBlocks;
+    int occupancyInThreads;
+    size_t dynamicSMemSize;
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    if (!minGridSize || !blockSize || !properties || !attributes || !state) {
+        return CUDA_OCC_ERROR_INVALID_INPUT;
+    }
+
+    status = cudaOccInputCheck(properties, attributes, state);
+    if (status != CUDA_OCC_SUCCESS) {
+        return status;
+    }
+
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try each block size, and pick the block size with maximum occupancy
+    /////////////////////////////////////////////////////////////////////////////////
+
+    occupancyLimit = properties->maxThreadsPerMultiprocessor;
+    granularity    = properties->warpSize;
+    blockSizeLimit        = __occMin(properties->maxThreadsPerBlock, attributes->maxThreadsPerBlock);
+    blockSizeLimitAligned = __occRoundUp(blockSizeLimit, granularity);
+
+    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
+        blockSizeToTry = __occMin(blockSizeLimit, blockSizeToTryAligned);
+
+        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
+
+        status = cudaOccMaxActiveBlocksPerMultiprocessor(
+            &result,
+            properties,
+            attributes,
+            state,
+            blockSizeToTry,
+            dynamicSMemSize);
+
+        if (status != CUDA_OCC_SUCCESS) {
+            return status;
+        }
+
+        occupancyInBlocks = result.activeBlocksPerMultiprocessor;
+
+        occupancyInThreads = blockSizeToTry * occupancyInBlocks;
+
+        if (occupancyInThreads > maxOccupancy) {
+            maxBlockSize = blockSizeToTry;
+            numBlocks    = occupancyInBlocks;
+            maxOccupancy = occupancyInThreads;
+        }
+
+        // Early out if we have reached the maximum
+        //
+        if (occupancyLimit == maxOccupancy) {
+            break;
+        }
+    }
+
+    ///////////////////////////
+    // Return best available
+    ///////////////////////////
+
+    // Suggested min grid size to achieve a full machine launch
+    //
+    *minGridSize = numBlocks * properties->numSms;
+    *blockSize = maxBlockSize;
+
+    return status;
+}
+
+} // namespace anonymous
+
+#endif /*__cplusplus */
+
+#undef __OCC_INLINE
+
+#endif /*__cuda_occupancy_h__*/
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h
new file mode 100644
index 0000000000000000000000000000000000000000..46bc89e4499576f1ae58848cd8684ba3e32420cf
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_pipeline.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDA_PIPELINE_H_
+# define _CUDA_PIPELINE_H_
+
+# include "cuda_pipeline_primitives.h"
+
+# if !defined(_CUDA_PIPELINE_CPLUSPLUS_11_OR_LATER)
+#  error This file requires compiler support for the ISO C++ 2011 standard. This support must be enabled with the \
+         -std=c++11 compiler option.
+# endif
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+#  include "cuda_awbarrier.h"
+# endif
+
+// Integration with libcu++'s cuda::barrier<cuda::thread_scope_block>.
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+#  if defined(_LIBCUDACXX_CUDA_ABI_VERSION)
+#   define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION _LIBCUDACXX_CUDA_ABI_VERSION
+#  else
+#   define _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION 4
+#  endif
+
+#  define _LIBCUDACXX_PIPELINE_CONCAT(X, Y) X ## Y
+#  define _LIBCUDACXX_PIPELINE_CONCAT2(X, Y) _LIBCUDACXX_PIPELINE_CONCAT(X, Y)
+#  define _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE _LIBCUDACXX_PIPELINE_CONCAT2(__, _LIBCUDACXX_PIPELINE_ASSUMED_ABI_VERSION)
+
+namespace cuda { inline namespace _LIBCUDACXX_PIPELINE_INLINE_NAMESPACE {
+    struct __block_scope_barrier_base;
+}}
+
+# endif
+
+_CUDA_PIPELINE_BEGIN_NAMESPACE
+
+template<size_t N, typename T>
+_CUDA_PIPELINE_QUALIFIER
+auto segment(T* ptr) -> T(*)[N];
+
+class pipeline {
+public:
+    pipeline(const pipeline&) = delete;
+    pipeline(pipeline&&) = delete;
+    pipeline& operator=(const pipeline&) = delete;
+    pipeline& operator=(pipeline&&) = delete;
+
+    _CUDA_PIPELINE_QUALIFIER pipeline();
+    _CUDA_PIPELINE_QUALIFIER size_t commit();
+    _CUDA_PIPELINE_QUALIFIER void commit_and_wait();
+    _CUDA_PIPELINE_QUALIFIER void wait(size_t batch);
+    template<unsigned N>
+    _CUDA_PIPELINE_QUALIFIER void wait_prior();
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+    _CUDA_PIPELINE_QUALIFIER void arrive_on(awbarrier& barrier);
+    _CUDA_PIPELINE_QUALIFIER void arrive_on(cuda::__block_scope_barrier_base& barrier);
+# endif
+
+private:
+    size_t current_batch;
+};
+
+template<class T>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T& dst, const T& src, pipeline& pipe);
+
+template<class T, size_t DstN, size_t SrcN>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe);
+
+template<size_t N, typename T>
+_CUDA_PIPELINE_QUALIFIER
+auto segment(T* ptr) -> T(*)[N]
+{
+    return (T(*)[N])ptr;
+}
+
+_CUDA_PIPELINE_QUALIFIER
+pipeline::pipeline()
+    : current_batch(0)
+{
+}
+
+_CUDA_PIPELINE_QUALIFIER
+size_t pipeline::commit()
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_commit();
+    return this->current_batch++;
+}
+
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::commit_and_wait()
+{
+    (void)pipeline::commit();
+    pipeline::wait_prior<0>();
+}
+
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::wait(size_t batch)
+{
+    const size_t prior = this->current_batch > batch ? this->current_batch - batch : 0;
+
+    switch (prior) {
+    case  0 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<0>(); break;
+    case  1 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<1>(); break;
+    case  2 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<2>(); break;
+    case  3 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<3>(); break;
+    case  4 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<4>(); break;
+    case  5 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<5>(); break;
+    case  6 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<6>(); break;
+    case  7 : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<7>(); break;
+    default : _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<8>(); break;
+    }
+}
+
+template<unsigned N>
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::wait_prior()
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_wait_prior<N>();
+}
+
+# if defined(_CUDA_PIPELINE_ARCH_700_OR_LATER)
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::arrive_on(awbarrier& barrier)
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(&barrier.barrier);
+}
+
+_CUDA_PIPELINE_QUALIFIER
+void pipeline::arrive_on(cuda::__block_scope_barrier_base & barrier)
+{
+    _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_arrive_on(reinterpret_cast<uint64_t *>(&barrier));
+}
+# endif
+
+template<class T>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T& dst, const T& src, pipeline& pipe)
+{
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&src) & (alignof(T) - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(&dst) & (alignof(T) - 1)));
+
+    if (__is_trivially_copyable(T)) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_relaxed<sizeof(T), alignof(T)>(
+                reinterpret_cast<void*>(&dst), reinterpret_cast<const void*>(&src));
+    } else {
+        dst = src;
+    }
+}
+
+template<class T, size_t DstN, size_t SrcN>
+_CUDA_PIPELINE_QUALIFIER
+void memcpy_async(T(*dst)[DstN], const T(*src)[SrcN], pipeline& pipe)
+{
+    constexpr size_t dst_size = sizeof(*dst);
+    constexpr size_t src_size = sizeof(*src);
+    static_assert(dst_size == 4 || dst_size == 8 || dst_size == 16, "Unsupported copy size.");
+    static_assert(src_size <= dst_size, "Source size must be less than or equal to destination size.");
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(src) & (dst_size - 1)));
+    _CUDA_PIPELINE_ASSERT(!(reinterpret_cast<uintptr_t>(dst) & (dst_size - 1)));
+
+    if (__is_trivially_copyable(T)) {
+        _CUDA_PIPELINE_INTERNAL_NAMESPACE::pipeline_copy_strict<sizeof(*dst), sizeof(*src)>(
+                reinterpret_cast<void*>(*dst), reinterpret_cast<const void*>(*src));
+    } else {
+        for (size_t i = 0; i < DstN; ++i) {
+            (*dst)[i] = (i < SrcN) ? (*src)[i] : T();
+        }
+    }
+}
+
+_CUDA_PIPELINE_END_NAMESPACE
+
+#endif /* !_CUDA_PIPELINE_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d297bb6c4140f2056618cedd9b34bdc42cd6367
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_runtime.h
@@ -0,0 +1,2725 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_RUNTIME_H__)
+#define __CUDA_RUNTIME_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__
+#endif
+
+#if !defined(__CUDACC_RTC__)
+#if defined(__GNUC__)
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+#pragma GCC diagnostic push
+#endif
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)))
+#pragma GCC diagnostic ignored "-Wunused-function"
+#endif
+#elif defined(_MSC_VER)
+#pragma warning(push)
+#pragma warning(disable: 4820)
+#endif
+#endif
+
+#ifdef __QNX__
+#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)
+typedef unsigned size_t;
+#endif
+#endif
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "crt/host_config.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "builtin_types.h"
+#include "library_types.h"
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#include "cuda_runtime_api.h"
+#include "driver_functions.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "crt/host_defines.h"
+#include "vector_functions.h"
+
+#if defined(__CUDACC__)
+
+#if defined(__CUDACC_RTC__)
+#include "nvrtc_device_runtime.h"
+#include "crt/device_functions.h"
+#include "crt/common_functions.h"
+#include "cuda_surface_types.h"
+#include "cuda_texture_types.h"
+#include "device_launch_parameters.h"
+
+#else /* !__CUDACC_RTC__ */
+#define EXCLUDE_FROM_RTC
+#include "crt/common_functions.h"
+#include "cuda_surface_types.h"
+#include "cuda_texture_types.h"
+#include "crt/device_functions.h"
+#include "device_launch_parameters.h"
+
+#if defined(__CUDACC_EXTENDED_LAMBDA__)
+#include <functional>
+#include <utility>
+struct  __device_builtin__ __nv_lambda_preheader_injection { };
+#endif /* defined(__CUDACC_EXTENDED_LAMBDA__) */
+
+#undef EXCLUDE_FROM_RTC
+#endif /* __CUDACC_RTC__ */
+
+#endif /* __CUDACC__ */
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+#if defined(__cplusplus) && !defined(__CUDACC_RTC__)
+
+#if __cplusplus >= 201103
+#include <utility>
+#endif
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * \addtogroup CUDART_HIGHLEVEL
+ * @{
+ */
+
+/**
+ *\brief Launches a device function
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * If the kernel has N parameters the \p args should point to array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, point to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimentions
+ * \param blockDim    - Block dimentions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory (defaults to 0)
+ * \param stream      - Stream identifier (defaults to NULL)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \ref ::cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaLaunchKernel(
+  const T *func,
+  dim3 gridDim,
+  dim3 blockDim,
+  void **args,
+  size_t sharedMem = 0,
+  cudaStream_t stream = 0
+)
+{
+    return ::cudaLaunchKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream);
+}
+
+
+#if __cplusplus >= 201103 || defined(__DOXYGEN_ONLY__)
+/**
+ * \brief Launches a CUDA function with launch-time configuration
+ *
+ * Invokes the kernel \p func on \p config->gridDim (\p config->gridDim.x
+ * &times; \p config->gridDim.y &times; \p config->gridDim.z) grid of blocks.
+ * Each block contains \p config->blockDim (\p config->blockDim.x &times;
+ * \p config->blockDim.y &times; \p config->blockDim.z) threads.
+ *
+ * \p config->dynamicSmemBytes sets the amount of dynamic shared memory that
+ * will be available to each thread block.
+ *
+ * \p config->stream specifies a stream the invocation is associated to.
+ *
+ * Configuration beyond grid and block dimensions, dynamic shared memory size,
+ * and stream can be provided with the following two fields of \p config:
+ *
+ * \p config->attrs is an array of \p config->numAttrs contiguous
+ * ::cudaLaunchAttribute elements. The value of this pointer is not considered
+ * if \p config->numAttrs is zero. However, in that case, it is recommended to
+ * set the pointer to NULL.
+ * \p config->numAttrs is the number of attributes populating the first
+ * \p config->numAttrs positions of the \p config->attrs array.
+ *
+ * The kernel arguments should be passed as arguments to this function via the
+ * \p args parameter pack.
+ *
+ * The C API version of this function, \p cudaLaunchKernelExC, is also available
+ * for pre-C++11 compilers and for use cases where the ability to pass kernel
+ * parameters via void* array is preferable.
+ *
+ * \param config - Launch configuration
+ * \param func   - Kernel to launch
+ * \param args   - Parameter pack of kernel parameters
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed,
+ * ::cudaErrorInvalidPtx,
+ * ::cudaErrorUnsupportedPtxVersion,
+ * ::cudaErrorNoKernelImageForDevice,
+ * ::cudaErrorJitCompilerNotFound,
+ * ::cudaErrorJitCompilationDisabled
+ * \note_null_stream
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * \ref ::cudaLaunchKernelExC(const cudaLaunchConfig_t *config, const void *func, void **args) "cudaLaunchKernelEx (C API)",
+ * ::cuLaunchKernelEx
+ */
+template<typename... ExpTypes, typename... ActTypes>
+static __inline__ __host__ cudaError_t cudaLaunchKernelEx(
+  const cudaLaunchConfig_t *config,
+  void (*kernel)(ExpTypes...),
+  ActTypes &&... args
+)
+{
+    return [&](ExpTypes... coercedArgs){
+        void *pArgs[] = { &coercedArgs... };
+        return ::cudaLaunchKernelExC(config, (const void *)kernel, pArgs);
+    }(std::forward<ActTypes>(args)...);
+}
+# endif
+
+/**
+ *\brief Launches a device function
+ *
+ * The function invokes kernel \p func on \p gridDim (\p gridDim.x &times; \p gridDim.y
+ * &times; \p gridDim.z) grid of blocks. Each block contains \p blockDim (\p blockDim.x &times;
+ * \p blockDim.y &times; \p blockDim.z) threads.
+ *
+ * The device on which this kernel is invoked must have a non-zero value for
+ * the device attribute ::cudaDevAttrCooperativeLaunch.
+ *
+ * The total number of blocks launched cannot exceed the maximum number of blocks per
+ * multiprocessor as returned by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor (or
+ * ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags) times the number of multiprocessors
+ * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+ *
+ * The kernel cannot make use of CUDA dynamic parallelism.
+ *
+ * If the kernel has N parameters the \p args should point to array of N pointers.
+ * Each pointer, from <tt>args[0]</tt> to <tt>args[N - 1]</tt>, point to the region
+ * of memory from which the actual parameter will be copied.
+ *
+ * \p sharedMem sets the amount of dynamic shared memory that will be available to
+ * each thread block.
+ *
+ * \p stream specifies a stream the invocation is associated to.
+ *
+ * \param func        - Device function symbol
+ * \param gridDim     - Grid dimentions
+ * \param blockDim    - Block dimentions
+ * \param args        - Arguments
+ * \param sharedMem   - Shared memory (defaults to 0)
+ * \param stream      - Stream identifier (defaults to NULL)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidConfiguration,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorLaunchTimeout,
+ * ::cudaErrorLaunchOutOfResources,
+ * ::cudaErrorSharedObjectInitFailed
+ * \notefnerr
+ * \note_async
+ * \note_null_stream
+ * \note_init_rt
+ * \note_callback
+ *
+ * \ref ::cudaLaunchCooperativeKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchCooperativeKernel (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaLaunchCooperativeKernel(
+  const T *func,
+  dim3 gridDim,
+  dim3 blockDim,
+  void **args,
+  size_t sharedMem = 0,
+  cudaStream_t stream = 0
+)
+{
+    return ::cudaLaunchCooperativeKernel((const void *)func, gridDim, blockDim, args, sharedMem, stream);
+}
+
+/**
+ * \brief \hl Creates an event object with the specified flags
+ *
+ * Creates an event object with the specified flags. Valid flags include:
+ * - ::cudaEventDefault: Default event creation flag.
+ * - ::cudaEventBlockingSync: Specifies that event should use blocking
+ *   synchronization. A host thread that uses ::cudaEventSynchronize() to wait
+ *   on an event created with this flag will block until the event actually
+ *   completes.
+ * - ::cudaEventDisableTiming: Specifies that the created event does not need
+ *   to record timing data.  Events created with this flag specified and
+ *   the ::cudaEventBlockingSync flag not specified will provide the best
+ *   performance when used with ::cudaStreamWaitEvent() and ::cudaEventQuery().
+ *
+ * \param event - Newly created event
+ * \param flags - Flags for new event
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorLaunchFailure,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaEventCreate(cudaEvent_t*) "cudaEventCreate (C API)",
+ * ::cudaEventCreateWithFlags, ::cudaEventRecord, ::cudaEventQuery,
+ * ::cudaEventSynchronize, ::cudaEventDestroy, ::cudaEventElapsedTime,
+ * ::cudaStreamWaitEvent
+ */
+static __inline__ __host__ cudaError_t cudaEventCreate(
+  cudaEvent_t  *event,
+  unsigned int  flags
+)
+{
+  return ::cudaEventCreateWithFlags(event, flags);
+}
+
+/**
+ * \brief \hl Allocates page-locked memory on the host
+ *
+ * Allocates \p size bytes of host memory that is page-locked and accessible
+ * to the device. The driver tracks the virtual memory ranges allocated with
+ * this function and automatically accelerates calls to functions such as
+ * ::cudaMemcpy(). Since the memory can be accessed directly by the device, it
+ * can be read or written with much higher bandwidth than pageable memory
+ * obtained with functions such as ::malloc(). Allocating excessive amounts of
+ * pinned memory may degrade system performance, since it reduces the amount
+ * of memory available to the system for paging. As a result, this function is
+ * best used sparingly to allocate staging areas for data exchange between host
+ * and device.
+ *
+ * The \p flags parameter enables different options to be specified that affect
+ * the allocation, as follows.
+ * - ::cudaHostAllocDefault: This flag's value is defined to be 0.
+ * - ::cudaHostAllocPortable: The memory returned by this call will be
+ * considered as pinned memory by all CUDA contexts, not just the one that
+ * performed the allocation.
+ * - ::cudaHostAllocMapped: Maps the allocation into the CUDA address space.
+ * The device pointer to the memory may be obtained by calling
+ * ::cudaHostGetDevicePointer().
+ * - ::cudaHostAllocWriteCombined: Allocates the memory as write-combined (WC).
+ * WC memory can be transferred across the PCI Express bus more quickly on some
+ * system configurations, but cannot be read efficiently by most CPUs.  WC
+ * memory is a good option for buffers that will be written by the CPU and read
+ * by the device via mapped pinned memory or host->device transfers.
+ *
+ * All of these flags are orthogonal to one another: a developer may allocate
+ * memory that is portable, mapped and/or write-combined with no restrictions.
+ *
+ * ::cudaSetDeviceFlags() must have been called with the ::cudaDeviceMapHost
+ * flag in order for the ::cudaHostAllocMapped flag to have any effect.
+ *
+ * The ::cudaHostAllocMapped flag may be specified on CUDA contexts for devices
+ * that do not support mapped pinned memory. The failure is deferred to
+ * ::cudaHostGetDevicePointer() because the memory may be mapped into other
+ * CUDA contexts via the ::cudaHostAllocPortable flag.
+ *
+ * Memory allocated by this function must be freed with ::cudaFreeHost().
+ *
+ * \param ptr   - Device pointer to allocated memory
+ * \param size  - Requested allocation size in bytes
+ * \param flags - Requested properties of allocated memory
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaSetDeviceFlags,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc
+ */
+static __inline__ __host__ cudaError_t cudaMallocHost(
+  void         **ptr,
+  size_t         size,
+  unsigned int   flags
+)
+{
+  return ::cudaHostAlloc(ptr, size, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaHostAlloc(
+  T            **ptr,
+  size_t         size,
+  unsigned int   flags
+)
+{
+  return ::cudaHostAlloc((void**)(void*)ptr, size, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaHostGetDevicePointer(
+  T            **pDevice,
+  void          *pHost,
+  unsigned int   flags
+)
+{
+  return ::cudaHostGetDevicePointer((void**)(void*)pDevice, pHost, flags);
+}
+
+/**
+ * \brief Allocates memory that will be automatically managed by the Unified Memory system
+ *
+ * Allocates \p size bytes of managed memory on the device and returns in
+ * \p *devPtr a pointer to the allocated memory. If the device doesn't support
+ * allocating managed memory, ::cudaErrorNotSupported is returned. Support
+ * for managed memory can be queried using the device attribute
+ * ::cudaDevAttrManagedMemory. The allocated memory is suitably
+ * aligned for any kind of variable. The memory is not cleared. If \p size
+ * is 0, ::cudaMallocManaged returns ::cudaErrorInvalidValue. The pointer
+ * is valid on the CPU and on all GPUs in the system that support managed memory.
+ * All accesses to this pointer must obey the Unified Memory programming model.
+ *
+ * \p flags specifies the default stream association for this allocation.
+ * \p flags must be one of ::cudaMemAttachGlobal or ::cudaMemAttachHost. The
+ * default value for \p flags is ::cudaMemAttachGlobal.
+ * If ::cudaMemAttachGlobal is specified, then this memory is accessible from
+ * any stream on any device. If ::cudaMemAttachHost is specified, then the
+ * allocation should not be accessed from devices that have a zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess; an explicit call to
+ * ::cudaStreamAttachMemAsync will be required to enable access on such devices.
+ *
+ * If the association is later changed via ::cudaStreamAttachMemAsync to
+ * a single stream, the default association, as specifed during ::cudaMallocManaged,
+ * is restored when that stream is destroyed. For __managed__ variables, the
+ * default association is always ::cudaMemAttachGlobal. Note that destroying a
+ * stream is an asynchronous operation, and as a result, the change to default
+ * association won't happen until all work in the stream has completed.
+ *
+ * Memory allocated with ::cudaMallocManaged should be released with ::cudaFree.
+ *
+ * Device memory oversubscription is possible for GPUs that have a non-zero value for the
+ * device attribute ::cudaDevAttrConcurrentManagedAccess. Managed memory on
+ * such GPUs may be evicted from device memory to host memory at any time by the Unified
+ * Memory driver in order to make room for other allocations.
+ *
+ * In a multi-GPU system where all GPUs have a non-zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess, managed memory may not be populated when this
+ * API returns and instead may be populated on access. In such systems, managed memory can
+ * migrate to any processor's memory at any time. The Unified Memory driver will employ heuristics to
+ * maintain data locality and prevent excessive page faults to the extent possible. The application
+ * can also guide the driver about memory usage patterns via ::cudaMemAdvise. The application
+ * can also explicitly migrate memory to a desired processor's memory via
+ * ::cudaMemPrefetchAsync.
+ *
+ * In a multi-GPU system where all of the GPUs have a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess and all the GPUs have peer-to-peer support
+ * with each other, the physical storage for managed memory is created on the GPU which is active
+ * at the time ::cudaMallocManaged is called. All other GPUs will reference the data at reduced
+ * bandwidth via peer mappings over the PCIe bus. The Unified Memory driver does not migrate
+ * memory among such GPUs.
+ *
+ * In a multi-GPU system where not all GPUs have peer-to-peer support with each other and
+ * where the value of the device attribute ::cudaDevAttrConcurrentManagedAccess
+ * is zero for at least one of those GPUs, the location chosen for physical storage of managed
+ * memory is system-dependent.
+ * - On Linux, the location chosen will be device memory as long as the current set of active
+ * contexts are on devices that either have peer-to-peer support with each other or have a
+ * non-zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If there is an active context on a GPU that does not have a non-zero value for that device
+ * attribute and it does not have peer-to-peer support with the other devices that have active
+ * contexts on them, then the location for physical storage will be 'zero-copy' or host memory.
+ * Note that this means that managed memory that is located in device memory is migrated to
+ * host memory if a new context is created on a GPU that doesn't have a non-zero value for
+ * the device attribute and does not support peer-to-peer with at least one of the other devices
+ * that has an active context. This in turn implies that context creation may fail if there is
+ * insufficient host memory to migrate all managed allocations.
+ * - On Windows, the physical storage is always created in 'zero-copy' or host memory.
+ * All GPUs will reference the data at reduced bandwidth over the PCIe bus. In these
+ * circumstances, use of the environment variable CUDA_VISIBLE_DEVICES is recommended to
+ * restrict CUDA to only use those GPUs that have peer-to-peer support.
+ * Alternatively, users can also set CUDA_MANAGED_FORCE_DEVICE_ALLOC to a non-zero
+ * value to force the driver to always use device memory for physical storage.
+ * When this environment variable is set to a non-zero value, all devices used in
+ * that process that support managed memory have to be peer-to-peer compatible
+ * with each other. The error ::cudaErrorInvalidDevice will be returned if a device
+ * that supports managed memory is used and it is not peer-to-peer compatible with
+ * any of the other managed memory supporting devices that were previously used in
+ * that process, even if ::cudaDeviceReset has been called on those devices. These
+ * environment variables are described in the CUDA programming guide under the
+ * "CUDA environment variables" section.
+ * - On ARM, managed memory is not available on discrete gpu with Drive PX-2.
+ *
+ * \param devPtr - Pointer to allocated device memory
+ * \param size   - Requested allocation size in bytes
+ * \param flags  - Must be either ::cudaMemAttachGlobal or ::cudaMemAttachHost (defaults to ::cudaMemAttachGlobal)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorMemoryAllocation,
+ * ::cudaErrorNotSupported,
+ * ::cudaErrorInvalidValue
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMallocPitch, ::cudaFree, ::cudaMallocArray, ::cudaFreeArray,
+ * ::cudaMalloc3D, ::cudaMalloc3DArray,
+ * \ref ::cudaMallocHost(void**, size_t) "cudaMallocHost (C API)",
+ * ::cudaFreeHost, ::cudaHostAlloc, ::cudaDeviceGetAttribute, ::cudaStreamAttachMemAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocManaged(
+  T            **devPtr,
+  size_t         size,
+  unsigned int   flags = cudaMemAttachGlobal
+)
+{
+  return ::cudaMallocManaged((void**)(void*)devPtr, size, flags);
+}
+
+/**
+ * \brief Attach memory to a stream asynchronously
+ *
+ * Enqueues an operation in \p stream to specify stream association of
+ * \p length bytes of memory starting from \p devPtr. This function is a
+ * stream-ordered operation, meaning that it is dependent on, and will
+ * only take effect when, previous work in stream has completed. Any
+ * previous association is automatically replaced.
+ *
+ * \p devPtr must point to an one of the following types of memories:
+ * - managed memory declared using the __managed__ keyword or allocated with
+ *   ::cudaMallocManaged.
+ * - a valid host-accessible region of system-allocated pageable memory. This
+ *   type of memory may only be specified if the device associated with the
+ *   stream reports a non-zero value for the device attribute
+ *   ::cudaDevAttrPageableMemoryAccess.
+ *
+ * For managed allocations, \p length must be either zero or the entire
+ * allocation's size. Both indicate that the entire allocation's stream
+ * association is being changed. Currently, it is not possible to change stream
+ * association for a portion of a managed allocation.
+ *
+ * For pageable allocations, \p length must be non-zero.
+ *
+ * The stream association is specified using \p flags which must be
+ * one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle.
+ * The default value for \p flags is ::cudaMemAttachSingle
+ * If the ::cudaMemAttachGlobal flag is specified, the memory can be accessed
+ * by any stream on any device.
+ * If the ::cudaMemAttachHost flag is specified, the program makes a guarantee
+ * that it won't access the memory on the device from any stream on a device that
+ * has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess.
+ * If the ::cudaMemAttachSingle flag is specified and \p stream is associated with
+ * a device that has a zero value for the device attribute ::cudaDevAttrConcurrentManagedAccess,
+ * the program makes a guarantee that it will only access the memory on the device
+ * from \p stream. It is illegal to attach singly to the NULL stream, because the
+ * NULL stream is a virtual global stream and not a specific stream. An error will
+ * be returned in this case.
+ *
+ * When memory is associated with a single stream, the Unified Memory system will
+ * allow CPU access to this memory region so long as all operations in \p stream
+ * have completed, regardless of whether other streams are active. In effect,
+ * this constrains exclusive ownership of the managed memory region by
+ * an active GPU to per-stream activity instead of whole-GPU activity.
+ *
+ * Accessing memory on the device from streams that are not associated with
+ * it will produce undefined results. No error checking is performed by the
+ * Unified Memory system to ensure that kernels launched into other streams
+ * do not access this region.
+ *
+ * It is a program's responsibility to order calls to ::cudaStreamAttachMemAsync
+ * via events, synchronization or other means to ensure legal access to memory
+ * at all times. Data visibility and coherency will be changed appropriately
+ * for all kernels which follow a stream-association change.
+ *
+ * If \p stream is destroyed while data is associated with it, the association is
+ * removed and the association reverts to the default visibility of the allocation
+ * as specified at ::cudaMallocManaged. For __managed__ variables, the default
+ * association is always ::cudaMemAttachGlobal. Note that destroying a stream is an
+ * asynchronous operation, and as a result, the change to default association won't
+ * happen until all work in the stream has completed.
+ *
+ * \param stream  - Stream in which to enqueue the attach operation
+ * \param devPtr  - Pointer to memory (must be a pointer to managed memory or
+ *                  to a valid host-accessible region of system-allocated
+ *                  memory)
+ * \param length  - Length of memory (defaults to zero)
+ * \param flags   - Must be one of ::cudaMemAttachGlobal, ::cudaMemAttachHost or ::cudaMemAttachSingle (defaults to ::cudaMemAttachSingle)
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorNotReady,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidResourceHandle
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaStreamCreate, ::cudaStreamCreateWithFlags, ::cudaStreamWaitEvent, ::cudaStreamSynchronize, ::cudaStreamAddCallback, ::cudaStreamDestroy, ::cudaMallocManaged
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaStreamAttachMemAsync(
+  cudaStream_t   stream,
+  T              *devPtr,
+  size_t         length = 0,
+  unsigned int   flags  = cudaMemAttachSingle
+)
+{
+  return ::cudaStreamAttachMemAsync(stream, (void*)devPtr, length, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMalloc(
+  T      **devPtr,
+  size_t   size
+)
+{
+  return ::cudaMalloc((void**)(void*)devPtr, size);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocHost(
+  T            **ptr,
+  size_t         size,
+  unsigned int   flags = 0
+)
+{
+  return cudaMallocHost((void**)(void*)ptr, size, flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocPitch(
+  T      **devPtr,
+  size_t  *pitch,
+  size_t   width,
+  size_t   height
+)
+{
+  return ::cudaMallocPitch((void**)(void*)devPtr, pitch, width, height);
+}
+
+/**
+ * \brief Allocate from a pool
+ *
+ * This is an alternate spelling for cudaMallocFromPoolAsync
+ * made available through operator overloading.
+ *
+ * \sa ::cudaMallocFromPoolAsync,
+ * \ref ::cudaMallocAsync(void** ptr, size_t size, cudaStream_t hStream)  "cudaMallocAsync (C API)"
+ */
+static __inline__ __host__ cudaError_t cudaMallocAsync(
+  void        **ptr,
+  size_t        size,
+  cudaMemPool_t memPool,
+  cudaStream_t  stream
+)
+{
+  return ::cudaMallocFromPoolAsync(ptr, size, memPool, stream);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocAsync(
+  T           **ptr,
+  size_t        size,
+  cudaMemPool_t memPool,
+  cudaStream_t  stream
+)
+{
+  return ::cudaMallocFromPoolAsync((void**)(void*)ptr, size, memPool, stream);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocAsync(
+  T           **ptr,
+  size_t        size,
+  cudaStream_t  stream
+)
+{
+  return ::cudaMallocAsync((void**)(void*)ptr, size, stream);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaMallocFromPoolAsync(
+  T           **ptr,
+  size_t        size,
+  cudaMemPool_t memPool,
+  cudaStream_t  stream
+)
+{
+  return ::cudaMallocFromPoolAsync((void**)(void*)ptr, size, memPool, stream);
+}
+
+#if defined(__CUDACC__)
+
+/**
+ * \brief \hl Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
+ *
+ * \param symbol - Device symbol reference
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyToSymbol(
+  const T                   &symbol,
+  const void                *src,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyHostToDevice
+)
+{
+  return ::cudaMemcpyToSymbol((const void*)&symbol, src, count, offset, kind);
+}
+
+/**
+ * \brief \hl Copies data to the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area pointed to by \p src
+ * to the memory area \p offset bytes from the start of symbol
+ * \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice or ::cudaMemcpyDeviceToDevice.
+ *
+ * ::cudaMemcpyToSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally
+ * be associated to a stream by passing a non-zero \p stream argument. If
+ * \p kind is ::cudaMemcpyHostToDevice and \p stream is non-zero, the copy
+ * may overlap with operations in other streams.
+ *
+ * \param symbol - Device symbol reference
+ * \param src    - Source memory address
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyFromSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyToSymbolAsync(
+  const T                   &symbol,
+  const void                *src,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyHostToDevice,
+        cudaStream_t         stream = 0
+)
+{
+  return ::cudaMemcpyToSymbolAsync((const void*)&symbol, src, count, offset, kind, stream);
+}
+
+/**
+ * \brief \hl Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol reference
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_sync
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync, ::cudaMemcpyFromSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyFromSymbol(
+        void                *dst,
+  const T                   &symbol,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyDeviceToHost
+)
+{
+  return ::cudaMemcpyFromSymbol(dst, (const void*)&symbol, count, offset, kind);
+}
+
+/**
+ * \brief \hl Copies data from the given symbol on the device
+ *
+ * Copies \p count bytes from the memory area \p offset bytes
+ * from the start of symbol \p symbol to the memory area pointed to by \p dst.
+ * The memory areas may not overlap. \p symbol is a variable that resides in
+ * global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost or ::cudaMemcpyDeviceToDevice.
+ *
+ * ::cudaMemcpyFromSymbolAsync() is asynchronous with respect to the host, so
+ * the call may return before the copy is complete. The copy can optionally be
+ * associated to a stream by passing a non-zero \p stream argument. If \p kind
+ * is ::cudaMemcpyDeviceToHost and \p stream is non-zero, the copy may overlap
+ * with operations in other streams.
+ *
+ * \param dst    - Destination memory address
+ * \param symbol - Device symbol reference
+ * \param count  - Size in bytes to copy
+ * \param offset - Offset from start of symbol in bytes
+ * \param kind   - Type of transfer
+ * \param stream - Stream identifier
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorInvalidMemcpyDirection,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_async
+ * \note_string_api_deprecation
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaMemcpy, ::cudaMemcpy2D,
+ * ::cudaMemcpy2DToArray, ::cudaMemcpy2DFromArray,
+ * ::cudaMemcpy2DArrayToArray, ::cudaMemcpyToSymbol,
+ * ::cudaMemcpyFromSymbol, ::cudaMemcpyAsync, ::cudaMemcpy2DAsync,
+ * ::cudaMemcpy2DToArrayAsync,
+ * ::cudaMemcpy2DFromArrayAsync,
+ * ::cudaMemcpyToSymbolAsync
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaMemcpyFromSymbolAsync(
+        void                *dst,
+  const T                   &symbol,
+        size_t               count,
+        size_t               offset = 0,
+        enum cudaMemcpyKind  kind   = cudaMemcpyDeviceToHost,
+        cudaStream_t         stream = 0
+)
+{
+  return ::cudaMemcpyFromSymbolAsync(dst, (const void*)&symbol, count, offset, kind, stream);
+}
+
+/**
+ * \brief Creates a memcpy node to copy to a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy to \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area pointed to by \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeToSymbol(
+    cudaGraphNode_t *pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t *pDependencies,
+    size_t numDependencies,
+    const T &symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  return ::cudaGraphAddMemcpyNodeToSymbol(pGraphNode, graph, pDependencies, numDependencies, (const void*)&symbol, src, count, offset, kind);
+}
+
+/**
+ * \brief Creates a memcpy node to copy from a symbol on the device and adds it to a graph
+ *
+ * Creates a new memcpy node to copy from \p symbol and adds it to \p graph with
+ * \p numDependencies dependencies specified via \p pDependencies.
+ * It is possible for \p numDependencies to be 0, in which case the node will be placed
+ * at the root of the graph. \p pDependencies may not have any duplicate entries.
+ * A handle to the new node will be returned in \p pGraphNode.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
+ *  pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ *  that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * Memcpy nodes have some additional restrictions with regards to managed memory, if the
+ * system contains at least one device which has a zero value for the device attribute
+ * ::cudaDevAttrConcurrentManagedAccess.
+ *
+ * \param pGraphNode      - Returns newly created node
+ * \param graph           - Graph to which to add the node
+ * \param pDependencies   - Dependencies of the node
+ * \param numDependencies - Number of dependencies
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeGetParams,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphCreate,
+ * ::cudaGraphDestroyNode,
+ * ::cudaGraphAddChildGraphNode,
+ * ::cudaGraphAddEmptyNode,
+ * ::cudaGraphAddKernelNode,
+ * ::cudaGraphAddHostNode,
+ * ::cudaGraphAddMemsetNode
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphAddMemcpyNodeFromSymbol(
+    cudaGraphNode_t* pGraphNode,
+    cudaGraph_t graph,
+    const cudaGraphNode_t* pDependencies,
+    size_t numDependencies,
+    void* dst,
+    const T &symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  return ::cudaGraphAddMemcpyNodeFromSymbol(pGraphNode, graph, pDependencies, numDependencies, dst, (const void*)&symbol, count, offset, kind);
+}
+
+/**
+ * \brief Sets a memcpy node's parameters to copy to a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p src to the memory area pointed to by \p offset bytes from the start
+ * of symbol \p symbol. The memory areas may not overlap. \p symbol is a variable that
+ * resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyHostToDevice, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of
+ * transfer is inferred from the pointer values. However, ::cudaMemcpyDefault
+ * is only allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsToSymbol(
+    cudaGraphNode_t node,
+    const T &symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  return ::cudaGraphMemcpyNodeSetParamsToSymbol(node, (const void*)&symbol, src, count, offset, kind);
+}
+
+/**
+ * \brief Sets a memcpy node's parameters to copy from a symbol on the device
+ *
+ * Sets the parameters of memcpy node \p node to the copy described by the provided parameters.
+ *
+ * When the graph is launched, the node will copy \p count bytes from the memory area
+ * pointed to by \p offset bytes from the start of symbol \p symbol to the memory area
+ *  pointed to by \p dst. The memory areas may not overlap. \p symbol is a variable
+ *  that resides in global or constant memory space. \p kind can be either
+ * ::cudaMemcpyDeviceToHost, ::cudaMemcpyDeviceToDevice, or ::cudaMemcpyDefault.
+ * Passing ::cudaMemcpyDefault is recommended, in which case the type of transfer
+ * is inferred from the pointer values. However, ::cudaMemcpyDefault is only
+ * allowed on systems that support unified virtual addressing.
+ *
+ * \param node            - Node to set the parameters for
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaMemcpyFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphMemcpyNodeGetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphMemcpyNodeSetParamsFromSymbol(
+    cudaGraphNode_t node,
+    void* dst,
+    const T &symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  return ::cudaGraphMemcpyNodeSetParamsFromSymbol(node, dst, (const void*)&symbol, count, offset, kind);
+}
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy to a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p src and \p symbol must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param symbol          - Device symbol address
+ * \param src             - Source memory address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeToSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsToSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    const T &symbol,
+    const void* src,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+    return ::cudaGraphExecMemcpyNodeSetParamsToSymbol(hGraphExec, node, (const void*)&symbol, src, count, offset, kind);
+}
+
+/**
+ * \brief Sets the parameters for a memcpy node in the given graphExec to copy from a symbol on the device
+ *
+ * Updates the work represented by \p node in \p hGraphExec as though \p node had 
+ * contained the given params at instantiation.  \p node must remain in the graph which was 
+ * used to instantiate \p hGraphExec.  Changed edges to and from \p node are ignored.
+ *
+ * \p symbol and \p dst must be allocated from the same contexts as the original source and
+ * destination memory.  The instantiation-time memory operands must be 1-dimensional.
+ * Zero-length operations are not supported.
+ *
+ * The modifications only affect future launches of \p hGraphExec.  Already enqueued 
+ * or running launches of \p hGraphExec are not affected by this call.  \p node is also 
+ * not modified by this call.
+ *
+ * Returns ::cudaErrorInvalidValue if the memory operands' mappings changed or
+ * the original memory operands are multidimensional.
+ *
+ * \param hGraphExec      - The executable graph in which to set the specified node
+ * \param node            - Memcpy node from the graph which was used to instantiate graphExec
+ * \param dst             - Destination memory address
+ * \param symbol          - Device symbol address
+ * \param count           - Size in bytes to copy
+ * \param offset          - Offset from start of symbol in bytes
+ * \param kind            - Type of transfer
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ * \note_graph_thread_safety
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaGraphAddMemcpyNode,
+ * ::cudaGraphAddMemcpyNodeFromSymbol,
+ * ::cudaGraphMemcpyNodeSetParams,
+ * ::cudaGraphMemcpyNodeSetParamsFromSymbol,
+ * ::cudaGraphInstantiate,
+ * ::cudaGraphExecMemcpyNodeSetParams,
+ * ::cudaGraphExecMemcpyNodeSetParamsToSymbol,
+ * ::cudaGraphExecKernelNodeSetParams,
+ * ::cudaGraphExecMemsetNodeSetParams,
+ * ::cudaGraphExecHostNodeSetParams
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGraphExecMemcpyNodeSetParamsFromSymbol(
+    cudaGraphExec_t hGraphExec,
+    cudaGraphNode_t node,
+    void* dst,
+    const T &symbol,
+    size_t count,
+    size_t offset,
+    enum cudaMemcpyKind kind)
+{
+  return ::cudaGraphExecMemcpyNodeSetParamsFromSymbol(hGraphExec, node, dst, (const void*)&symbol, count, offset, kind);
+}
+
+#if __cplusplus >= 201103
+
+/**
+ * \brief Creates a user object by wrapping a C++ object
+ *
+ * TODO detail
+ *
+ * \param object_out      - Location to return the user object handle
+ * \param objectToWrap    - This becomes the \ptr argument to ::cudaUserObjectCreate. A
+ *                          lambda will be passed for the \p destroy argument, which calls
+ *                          delete on this object pointer.
+ * \param initialRefcount - The initial refcount to create the object with, typically 1. The
+ *                          initial references are owned by the calling thread.
+ * \param flags           - Currently it is required to pass cudaUserObjectNoDestructorSync,
+ *                          which is the only defined flag. This indicates that the destroy
+ *                          callback cannot be waited on by any CUDA API. Users requiring
+ *                          synchronization of the callback should signal its completion
+ *                          manually.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue
+ *
+ * \sa
+ * ::cudaUserObjectCreate
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaUserObjectCreate(
+    cudaUserObject_t *object_out,
+    T *objectToWrap,
+    unsigned int initialRefcount,
+    unsigned int flags)
+{
+    return ::cudaUserObjectCreate(
+            object_out,
+            objectToWrap,
+            [](void *vpObj) { delete reinterpret_cast<T *>(vpObj); },
+            initialRefcount,
+            flags);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaUserObjectCreate(
+    cudaUserObject_t *object_out,
+    T *objectToWrap,
+    unsigned int initialRefcount,
+    cudaUserObjectFlags flags)
+{
+    return cudaUserObjectCreate(object_out, objectToWrap, initialRefcount, (unsigned int)flags);
+}
+
+#endif
+
+/**
+ * \brief \hl Finds the address associated with a CUDA symbol
+ *
+ * Returns in \p *devPtr the address of symbol \p symbol on the device.
+ * \p symbol can either be a variable that resides in global or constant memory space.
+ * If \p symbol cannot be found, or if \p symbol is not declared
+ * in the global or constant memory space, \p *devPtr is unchanged and the error
+ * ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param devPtr - Return device pointer associated with symbol
+ * \param symbol - Device symbol reference
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaGetSymbolAddress(void**, const void*) "cudaGetSymbolAddress (C API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const T&) "cudaGetSymbolSize (C++ API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGetSymbolAddress(
+        void **devPtr,
+  const T     &symbol
+)
+{
+  return ::cudaGetSymbolAddress(devPtr, (const void*)&symbol);
+}
+
+/**
+ * \brief \hl Finds the size of the object associated with a CUDA symbol
+ *
+ * Returns in \p *size the size of symbol \p symbol. \p symbol must be a
+ * variable that resides in global or constant memory space.
+ * If \p symbol cannot be found, or if \p symbol is not declared
+ * in global or constant memory space, \p *size is unchanged and the error
+ * ::cudaErrorInvalidSymbol is returned.
+ *
+ * \param size   - Size of object associated with symbol
+ * \param symbol - Device symbol reference
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidSymbol,
+ * ::cudaErrorNoKernelImageForDevice
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaGetSymbolAddress(void**, const T&) "cudaGetSymbolAddress (C++ API)",
+ * \ref ::cudaGetSymbolSize(size_t*, const void*) "cudaGetSymbolSize (C API)"
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaGetSymbolSize(
+        size_t *size,
+  const T      &symbol
+)
+{
+  return ::cudaGetSymbolSize(size, (const void*)&symbol);
+}
+
+/**
+ * \brief \hl Binds a memory area to a texture
+ *
+ * Binds \p size bytes of the memory area pointed to by \p devPtr to texture
+ * reference \p tex. \p desc describes how the memory is interpreted when
+ * fetching values from the texture. The \p offset parameter is an optional
+ * byte offset as with the low-level
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture()"
+ * function. Any memory previously bound to \p tex is unbound.
+ *
+ * \param offset - Offset in bytes
+ * \param tex    - Texture to bind
+ * \param devPtr - Memory area on device
+ * \param desc   - Channel format
+ * \param size   - Size of the memory area pointed to by devPtr
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture(
+        size_t                           *offset,
+  const struct texture<T, dim, readMode> &tex,
+  const void                             *devPtr,
+  const struct cudaChannelFormatDesc     &desc,
+        size_t                            size = UINT_MAX
+)
+{
+  return ::cudaBindTexture(offset, &tex, devPtr, &desc, size);
+}
+
+/**
+ * \brief \hl Binds a memory area to a texture
+ *
+ * Binds \p size bytes of the memory area pointed to by \p devPtr to texture
+ * reference \p tex. The channel descriptor is inherited from the texture
+ * reference type. The \p offset parameter is an optional byte offset as with
+ * the low-level
+ * ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t)
+ * function. Any memory previously bound to \p tex is unbound.
+ *
+ * \param offset - Offset in bytes
+ * \param tex    - Texture to bind
+ * \param devPtr - Memory area on device
+ * \param size   - Size of the memory area pointed to by devPtr
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t) "cudaBindTexture (C API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture(
+        size_t                           *offset,
+  const struct texture<T, dim, readMode> &tex,
+  const void                             *devPtr,
+        size_t                            size = UINT_MAX
+)
+{
+  return cudaBindTexture(offset, tex, devPtr, tex.channelDesc, size);
+}
+
+/**
+ * \brief \hl Binds a 2D memory area to a texture
+ *
+ * Binds the 2D memory area pointed to by \p devPtr to the
+ * texture reference \p tex. The size of the area is constrained by
+ * \p width in texel units, \p height in texel units, and \p pitch in byte
+ * units. \p desc describes how the memory is interpreted when fetching values
+ * from the texture. Any memory previously bound to \p tex is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses,
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D()"
+ * returns in \p *offset a byte offset that
+ * must be applied to texture fetches in order to read from the desired memory.
+ * This offset must be divided by the texel size and passed to kernels that
+ * read from the texture so they can be applied to the ::tex2D() function.
+ * If the device memory pointer was returned from ::cudaMalloc(), the offset is
+ * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
+ *
+ * \param offset - Offset in bytes
+ * \param tex    - Texture reference to bind
+ * \param devPtr - 2D memory area on device
+ * \param desc   - Channel format
+ * \param width  - Width in texel units
+ * \param height - Height in texel units
+ * \param pitch  - Pitch in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture2D(
+        size_t                           *offset,
+  const struct texture<T, dim, readMode> &tex,
+  const void                             *devPtr,
+  const struct cudaChannelFormatDesc     &desc,
+  size_t                                  width,
+  size_t                                  height,
+  size_t                                  pitch
+)
+{
+  return ::cudaBindTexture2D(offset, &tex, devPtr, &desc, width, height, pitch);
+}
+
+/**
+ * \brief \hl Binds a 2D memory area to a texture
+ *
+ * Binds the 2D memory area pointed to by \p devPtr to the
+ * texture reference \p tex. The size of the area is constrained by
+ * \p width in texel units, \p height in texel units, and \p pitch in byte
+ * units. The channel descriptor is inherited from the texture reference
+ * type. Any memory previously bound to \p tex is unbound.
+ *
+ * Since the hardware enforces an alignment requirement on texture base
+ * addresses,
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D()"
+ * returns in \p *offset a byte offset that
+ * must be applied to texture fetches in order to read from the desired memory.
+ * This offset must be divided by the texel size and passed to kernels that
+ * read from the texture so they can be applied to the ::tex2D() function.
+ * If the device memory pointer was returned from ::cudaMalloc(), the offset is
+ * guaranteed to be 0 and NULL may be passed as the \p offset parameter.
+ *
+ * \param offset - Offset in bytes
+ * \param tex    - Texture reference to bind
+ * \param devPtr - 2D memory area on device
+ * \param width  - Width in texel units
+ * \param height - Height in texel units
+ * \param pitch  - Pitch in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct textureReference*, const void*, const struct cudaChannelFormatDesc*, size_t, size_t, size_t) "cudaBindTexture2D (C API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode>&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTexture2D(
+        size_t                           *offset,
+  const struct texture<T, dim, readMode> &tex,
+  const void                             *devPtr,
+  size_t                                  width,
+  size_t                                  height,
+  size_t                                  pitch
+)
+{
+  return ::cudaBindTexture2D(offset, &tex, devPtr, &tex.channelDesc, width, height, pitch);
+}
+
+/**
+ * \brief \hl Binds an array to a texture
+ *
+ * Binds the CUDA array \p array to the texture reference \p tex.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the texture. Any CUDA array previously bound to \p tex is unbound.
+ *
+ * \param tex   - Texture to bind
+ * \param array - Memory array on device
+ * \param desc  - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray(
+  const struct texture<T, dim, readMode> &tex,
+  cudaArray_const_t                       array,
+  const struct cudaChannelFormatDesc     &desc
+)
+{
+  return ::cudaBindTextureToArray(&tex, array, &desc);
+}
+
+/**
+ * \brief \hl Binds an array to a texture
+ *
+ * Binds the CUDA array \p array to the texture reference \p tex.
+ * The channel descriptor is inherited from the CUDA array. Any CUDA array
+ * previously bound to \p tex is unbound.
+ *
+ * \param tex   - Texture to bind
+ * \param array - Memory array on device
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToArray(
+  const struct texture<T, dim, readMode> &tex,
+  cudaArray_const_t                       array
+)
+{
+  struct cudaChannelFormatDesc desc;
+  cudaError_t                  err = ::cudaGetChannelDesc(&desc, array);
+
+  return err == cudaSuccess ? cudaBindTextureToArray(tex, array, desc) : err;
+}
+
+/**
+ * \brief \hl Binds a mipmapped array to a texture
+ *
+ * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex.
+ * \p desc describes how the memory is interpreted when fetching values from
+ * the texture. Any CUDA mipmapped array previously bound to \p tex is unbound.
+ *
+ * \param tex            - Texture to bind
+ * \param mipmappedArray - Memory mipmapped array on device
+ * \param desc           - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray(
+  const struct texture<T, dim, readMode> &tex,
+  cudaMipmappedArray_const_t              mipmappedArray,
+  const struct cudaChannelFormatDesc     &desc
+)
+{
+  return ::cudaBindTextureToMipmappedArray(&tex, mipmappedArray, &desc);
+}
+
+/**
+ * \brief \hl Binds a mipmapped array to a texture
+ *
+ * Binds the CUDA mipmapped array \p mipmappedArray to the texture reference \p tex.
+ * The channel descriptor is inherited from the CUDA array. Any CUDA mipmapped array
+ * previously bound to \p tex is unbound.
+ *
+ * \param tex            - Texture to bind
+ * \param mipmappedArray - Memory mipmapped array on device
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct textureReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindTextureToArray (C API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindTextureToMipmappedArray(
+  const struct texture<T, dim, readMode> &tex,
+  cudaMipmappedArray_const_t              mipmappedArray
+)
+{
+  struct cudaChannelFormatDesc desc;
+  cudaArray_t                  levelArray;
+  cudaError_t                  err = ::cudaGetMipmappedArrayLevel(&levelArray, mipmappedArray, 0);
+  
+  if (err != cudaSuccess) {
+      return err;
+  }
+  err = ::cudaGetChannelDesc(&desc, levelArray);
+
+  return err == cudaSuccess ? cudaBindTextureToMipmappedArray(tex, mipmappedArray, desc) : err;
+}
+
+/**
+ * \brief \hl Unbinds a texture
+ *
+ * Unbinds the texture bound to \p tex. If \p texref is not currently bound, no operation is performed.
+ *
+ * \param tex - Texture to unbind
+ *
+ * \return 
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidTexture
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct textureReference*) "cudaUnbindTexture (C API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct texture<T, dim, readMode >&) "cudaGetTextureAlignmentOffset (C++ API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaUnbindTexture(
+  const struct texture<T, dim, readMode> &tex
+)
+{
+  return ::cudaUnbindTexture(&tex);
+}
+
+/**
+ * \brief \hl Get the alignment offset of a texture
+ *
+ * Returns in \p *offset the offset that was returned when texture reference
+ * \p tex was bound.
+ *
+ * \param offset - Offset of texture reference in bytes
+ * \param tex    - Texture to get offset of
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidTexture,
+ * ::cudaErrorInvalidTextureBinding
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaCreateChannelDesc(void) "cudaCreateChannelDesc (C++ API)",
+ * ::cudaGetChannelDesc, ::cudaGetTextureReference,
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t) "cudaBindTexture (C++ API)",
+ * \ref ::cudaBindTexture(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t) "cudaBindTexture (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, const struct cudaChannelFormatDesc&, size_t, size_t, size_t) "cudaBindTexture2D (C++ API)",
+ * \ref ::cudaBindTexture2D(size_t*, const struct texture<T, dim, readMode>&, const void*, size_t, size_t, size_t) "cudaBindTexture2D (C++ API, inherited channel descriptor)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindTextureToArray (C++ API)",
+ * \ref ::cudaBindTextureToArray(const struct texture<T, dim, readMode>&, cudaArray_const_t) "cudaBindTextureToArray (C++ API, inherited channel descriptor)",
+ * \ref ::cudaUnbindTexture(const struct texture<T, dim, readMode>&) "cudaUnbindTexture (C++ API)",
+ * \ref ::cudaGetTextureAlignmentOffset(size_t*, const struct textureReference*) "cudaGetTextureAlignmentOffset (C API)"
+ */
+template<class T, int dim, enum cudaTextureReadMode readMode>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaGetTextureAlignmentOffset(
+        size_t                           *offset,
+  const struct texture<T, dim, readMode> &tex
+)
+{
+  return ::cudaGetTextureAlignmentOffset(offset, &tex);
+}
+
+/**
+ * \brief \hl Sets the preferred cache configuration for a device function
+ *
+ * On devices where the L1 cache and shared memory use the same hardware
+ * resources, this sets through \p cacheConfig the preferred cache configuration
+ * for the function specified via \p func. This is only a preference. The
+ * runtime will use the requested configuration if possible, but it is free to
+ * choose a different configuration if required to execute \p func.
+ *
+ * \p func must be a pointer to a function that executes on the device.
+ * The parameter specified by \p func must be declared as a \p __global__
+ * function. If the specified function does not exist,
+ * then ::cudaErrorInvalidDeviceFunction is returned.
+ *
+ * This setting does nothing on devices where the size of the L1 cache and
+ * shared memory are fixed.
+ *
+ * Launching a kernel with a different preference than the most recent
+ * preference setting may insert a device-side synchronization point.
+ *
+ * The supported cache configurations are:
+ * - ::cudaFuncCachePreferNone: no preference for shared memory or L1 (default)
+ * - ::cudaFuncCachePreferShared: prefer larger shared memory and smaller L1 cache
+ * - ::cudaFuncCachePreferL1: prefer larger L1 cache and smaller shared memory
+ *
+ * \param func        - device function pointer
+ * \param cacheConfig - Requested cache configuration
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(const void*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, T*) "cudaFuncGetAttributes (C++ API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost,
+ * ::cudaThreadGetCacheConfig,
+ * ::cudaThreadSetCacheConfig
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncSetCacheConfig(
+  T                  *func,
+  enum cudaFuncCache  cacheConfig
+)
+{
+  return ::cudaFuncSetCacheConfig((const void*)func, cacheConfig);
+}
+
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncSetSharedMemConfig(
+  T                        *func,
+  enum cudaSharedMemConfig  config
+)
+{
+  return ::cudaFuncSetSharedMemConfig((const void*)func, config);
+}
+
+#endif // __CUDACC__
+
+/**
+ * \brief Returns occupancy for a device function
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calulated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+    int   *numBlocks,
+    T      func,
+    int    blockSize,
+    size_t dynamicSMemSize)
+{
+    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, cudaOccupancyDefault);
+}
+
+/**
+ * \brief Returns occupancy for a device function with the specified flags
+ *
+ * Returns in \p *numBlocks the maximum number of active blocks per
+ * streaming multiprocessor for the device function.
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ *
+ * - ::cudaOccupancyDisableCachingOverride: suppresses the default behavior
+ *   on platform where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator to return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param numBlocks       - Returned occupancy
+ * \param func            - Kernel function for which occupancy is calulated
+ * \param blockSize       - Block size the kernel is intended to be launched with
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param flags           - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+    int         *numBlocks,
+    T            func,
+    int          blockSize,
+    size_t       dynamicSMemSize,
+    unsigned int flags)
+{
+    return ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(numBlocks, (const void*)func, blockSize, dynamicSMemSize, flags);
+}
+
+/**
+ * Helper functor for cudaOccupancyMaxPotentialBlockSize
+ */
+class __cudaOccupancyB2DHelper {
+  size_t n;
+public:
+  inline __host__ CUDART_DEVICE __cudaOccupancyB2DHelper(size_t n_) : n(n_) {}
+  inline __host__ CUDART_DEVICE size_t operator()(int)
+  {
+      return n;
+  }
+};
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
+ *
+ * Returns in \p *minGridSize and \p *blocksize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * The \p flags parameter controls how special cases are handled. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ *   on platform where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator to return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ * \param flags       - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+
+template<typename UnaryFunction, class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(
+    int           *minGridSize,
+    int           *blockSize,
+    T              func,
+    UnaryFunction  blockSizeToDynamicSMemSize,
+    int            blockSizeLimit = 0,
+    unsigned int   flags = 0)
+{
+    cudaError_t status;
+
+    // Device and function properties
+    int                       device;
+    struct cudaFuncAttributes attr;
+
+    // Limits
+    int maxThreadsPerMultiProcessor;
+    int warpSize;
+    int devMaxThreadsPerBlock;
+    int multiProcessorCount;
+    int funcMaxThreadsPerBlock;
+    int occupancyLimit;
+    int granularity;
+
+    // Recorded maximum
+    int maxBlockSize = 0;
+    int numBlocks    = 0;
+    int maxOccupancy = 0;
+
+    // Temporary
+    int blockSizeToTryAligned;
+    int blockSizeToTry;
+    int blockSizeLimitAligned;
+    int occupancyInBlocks;
+    int occupancyInThreads;
+    size_t dynamicSMemSize;
+
+    ///////////////////////////
+    // Check user input
+    ///////////////////////////
+
+    if (!minGridSize || !blockSize || !func) {
+        return cudaErrorInvalidValue;
+    }
+
+    //////////////////////////////////////////////
+    // Obtain device and function properties
+    //////////////////////////////////////////////
+
+    status = ::cudaGetDevice(&device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &maxThreadsPerMultiProcessor,
+        cudaDevAttrMaxThreadsPerMultiProcessor,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &warpSize,
+        cudaDevAttrWarpSize,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &devMaxThreadsPerBlock,
+        cudaDevAttrMaxThreadsPerBlock,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaDeviceGetAttribute(
+        &multiProcessorCount,
+        cudaDevAttrMultiProcessorCount,
+        device);
+    if (status != cudaSuccess) {
+        return status;
+    }
+
+    status = cudaFuncGetAttributes(&attr, func);
+    if (status != cudaSuccess) {
+        return status;
+    }
+    
+    funcMaxThreadsPerBlock = attr.maxThreadsPerBlock;
+
+    /////////////////////////////////////////////////////////////////////////////////
+    // Try each block size, and pick the block size with maximum occupancy
+    /////////////////////////////////////////////////////////////////////////////////
+
+    occupancyLimit = maxThreadsPerMultiProcessor;
+    granularity    = warpSize;
+
+    if (blockSizeLimit == 0) {
+        blockSizeLimit = devMaxThreadsPerBlock;
+    }
+
+    if (devMaxThreadsPerBlock < blockSizeLimit) {
+        blockSizeLimit = devMaxThreadsPerBlock;
+    }
+
+    if (funcMaxThreadsPerBlock < blockSizeLimit) {
+        blockSizeLimit = funcMaxThreadsPerBlock;
+    }
+
+    blockSizeLimitAligned = ((blockSizeLimit + (granularity - 1)) / granularity) * granularity;
+
+    for (blockSizeToTryAligned = blockSizeLimitAligned; blockSizeToTryAligned > 0; blockSizeToTryAligned -= granularity) {
+        // This is needed for the first iteration, because
+        // blockSizeLimitAligned could be greater than blockSizeLimit
+        //
+        if (blockSizeLimit < blockSizeToTryAligned) {
+            blockSizeToTry = blockSizeLimit;
+        } else {
+            blockSizeToTry = blockSizeToTryAligned;
+        }
+        
+        dynamicSMemSize = blockSizeToDynamicSMemSize(blockSizeToTry);
+
+        status = cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags(
+            &occupancyInBlocks,
+            func,
+            blockSizeToTry,
+            dynamicSMemSize,
+            flags);
+
+        if (status != cudaSuccess) {
+            return status;
+        }
+
+        occupancyInThreads = blockSizeToTry * occupancyInBlocks;
+
+        if (occupancyInThreads > maxOccupancy) {
+            maxBlockSize = blockSizeToTry;
+            numBlocks    = occupancyInBlocks;
+            maxOccupancy = occupancyInThreads;
+        }
+
+        // Early out if we have reached the maximum
+        //
+        if (occupancyLimit == maxOccupancy) {
+            break;
+        }
+    }
+
+    ///////////////////////////
+    // Return best available
+    ///////////////////////////
+
+    // Suggested min grid size to achieve a full machine launch
+    //
+    *minGridSize = numBlocks * multiProcessorCount;
+    *blockSize = maxBlockSize;
+
+    return status;
+}
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
+ *
+ * Returns in \p *minGridSize and \p *blocksize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param blockSizeToDynamicSMemSize - A unary function / functor that takes block size, and returns the size, in bytes, of dynamic shared memory needed for a block
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+
+template<typename UnaryFunction, class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeVariableSMem(
+    int           *minGridSize,
+    int           *blockSize,
+    T              func,
+    UnaryFunction  blockSizeToDynamicSMemSize,
+    int            blockSizeLimit = 0)
+{
+    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, blockSizeToDynamicSMemSize, blockSizeLimit, cudaOccupancyDefault);
+}
+
+/**
+ * \brief Returns grid and block size that achieves maximum potential occupancy for a device function
+ *
+ * Returns in \p *minGridSize and \p *blocksize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
+ * amount of per-block dynamic shared memory changes with different
+ * block sizes.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSize(
+    int    *minGridSize,
+    int    *blockSize,
+    T       func,
+    size_t  dynamicSMemSize = 0,
+    int     blockSizeLimit = 0)
+{
+  return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, cudaOccupancyDefault);
+}
+
+/**
+ * \brief Returns dynamic shared memory available per block when launching \p numBlocks blocks on SM.
+ *
+ * Returns in \p *dynamicSmemSize the maximum size of dynamic shared memory to allow \p numBlocks blocks per SM. 
+ *
+ * \param dynamicSmemSize - Returned maximum dynamic shared memory 
+ * \param func            - Kernel function for which occupancy is calculated
+ * \param numBlocks       - Number of blocks to fit on SM 
+ * \param blockSize       - Size of the block
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeWithFlags
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyAvailableDynamicSMemPerBlock(
+    size_t *dynamicSmemSize,
+    T      func,
+    int    numBlocks,
+    int    blockSize)
+{
+    return ::cudaOccupancyAvailableDynamicSMemPerBlock(dynamicSmemSize, (const void*)func, numBlocks, blockSize);
+}
+
+/**
+ * \brief Returns grid and block size that achived maximum potential occupancy for a device function with the specified flags
+ *
+ * Returns in \p *minGridSize and \p *blocksize a suggested grid /
+ * block size pair that achieves the best potential occupancy
+ * (i.e. the maximum number of active warps with the smallest number
+ * of blocks).
+ *
+ * The \p flags parameter controls how special cases are handle. Valid flags include:
+ *
+ * - ::cudaOccupancyDefault: keeps the default behavior as
+ *   ::cudaOccupancyMaxPotentialBlockSize
+ *
+ * - ::cudaOccupancyDisableCachingOverride: This flag suppresses the default behavior
+ *   on platform where global caching affects occupancy. On such platforms, if caching
+ *   is enabled, but per-block SM resource usage would result in zero occupancy, the
+ *   occupancy calculator will calculate the occupancy as if caching is disabled.
+ *   Setting this flag makes the occupancy calculator to return 0 in such cases.
+ *   More information can be found about this feature in the "Unified L1/Texture Cache"
+ *   section of the Maxwell tuning guide.
+ *
+ * Use \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem if the
+ * amount of per-block dynamic shared memory changes with different
+ * block sizes.
+ *
+ * \param minGridSize - Returned minimum grid size needed to achieve the best potential occupancy
+ * \param blockSize   - Returned block size
+ * \param func        - Device function symbol
+ * \param dynamicSMemSize - Per-block dynamic shared memory usage intended, in bytes
+ * \param blockSizeLimit  - The maximum block size \p func is designed to work with. 0 means no limit.
+ * \param flags       - Requested behavior for the occupancy calculator
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDevice,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa ::cudaOccupancyMaxPotentialBlockSize
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+ * \sa ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMem
+ * \sa ::cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags
+ * \sa ::cudaOccupancyAvailableDynamicSMemPerBlock
+ */
+template<class T>
+static __inline__ __host__ CUDART_DEVICE cudaError_t cudaOccupancyMaxPotentialBlockSizeWithFlags(
+    int    *minGridSize,
+    int    *blockSize,
+    T      func,
+    size_t dynamicSMemSize = 0,
+    int    blockSizeLimit = 0,
+    unsigned int flags = 0)
+{
+    return cudaOccupancyMaxPotentialBlockSizeVariableSMemWithFlags(minGridSize, blockSize, func, __cudaOccupancyB2DHelper(dynamicSMemSize), blockSizeLimit, flags);
+}
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum cluster size in \p *clusterSize.
+ *
+ * The cluster dimensions in \p config are ignored. If func has a required
+ * cluster size set (see ::cudaFuncGetAttributes),\p *clusterSize will reflect 
+ * the required cluster size.
+ *
+ * By default this function will always return a value that's portable on
+ * future hardware. A higher value may be returned if the kernel function
+ * allows non-portable cluster sizes.
+ *
+ * This function will respect the compile time launch bounds.
+ *
+ * \param clusterSize - Returned maximum cluster size that can be launched
+ *                      for the given kernel function and launch configuration
+ * \param func        - Kernel function for which maximum cluster
+ *                      size is calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaFuncGetAttributes
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxPotentialClusterSize(
+    int *clusterSize,
+    T *func,
+    const cudaLaunchConfig_t *config)
+{
+    return ::cudaOccupancyMaxPotentialClusterSize(clusterSize, (const void*)func, config);
+}
+
+/**
+ * \brief Given the kernel function (\p func) and launch configuration
+ * (\p config), return the maximum number of clusters that could co-exist
+ * on the target device in \p *numClusters.
+ *
+ * If the function has required cluster size already set (see
+ * ::cudaFuncGetAttributes), the cluster size from config must either be
+ * unspecified or match the required size.
+ * Without required sizes, the cluster size must be specified in config,
+ * else the function will return an error.
+ *
+ * Note that various attributes of the kernel function may affect occupancy
+ * calculation. Runtime environment may affect how the hardware schedules
+ * the clusters, so the calculated occupancy is not guaranteed to be achievable.
+ *
+ * \param numClusters - Returned maximum number of clusters that
+ *                      could co-exist on the target device
+ * \param func        - Kernel function for which maximum number
+ *                      of clusters are calculated
+ * \param config      - Launch configuration for the given kernel function
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidClusterSize,
+ * ::cudaErrorUnknown,
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa
+ * ::cudaFuncGetAttributes
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaOccupancyMaxActiveClusters(
+    int *numClusters,
+    T *func,
+    const cudaLaunchConfig_t *config)
+{
+    return ::cudaOccupancyMaxActiveClusters(numClusters, (const void*)func, config);
+}
+
+#if defined __CUDACC__
+
+/**
+ * \brief \hl Find out attributes for a given function
+ *
+ * This function obtains the attributes of a function specified via \p entry.
+ * The parameter \p entry must be a pointer to a function that executes
+ * on the device. The parameter specified by \p entry must be declared as a \p __global__
+ * function. The fetched attributes are placed in \p attr. If the specified
+ * function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
+ *
+ * Note that some function attributes such as
+ * \ref ::cudaFuncAttributes::maxThreadsPerBlock "maxThreadsPerBlock"
+ * may vary based on the device that is currently being used.
+ *
+ * \param attr  - Return pointer to function's attributes
+ * \param entry - Function to get attributes of
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncGetAttributes(
+  struct cudaFuncAttributes *attr,
+  T                         *entry
+)
+{
+  return ::cudaFuncGetAttributes(attr, (const void*)entry);
+}
+
+/**
+ * \brief \hl Set attributes for a given function
+ *
+ * This function sets the attributes of a function specified via \p entry.
+ * The parameter \p entry must be a pointer to a function that executes
+ * on the device. The parameter specified by \p entry must be declared as a \p __global__
+ * function. The enumeration defined by \p attr is set to the value defined by \p value.
+ * If the specified function does not exist, then ::cudaErrorInvalidDeviceFunction is returned.
+ * If the specified attribute cannot be written, or if the value is incorrect, 
+ * then ::cudaErrorInvalidValue is returned.
+ *
+ * Valid values for \p attr are:
+ * - ::cudaFuncAttributeMaxDynamicSharedMemorySize - The requested maximum size in bytes of dynamically-allocated shared memory. The sum of this value and the function attribute ::sharedSizeBytes
+ *   cannot exceed the device attribute ::cudaDevAttrMaxSharedMemoryPerBlockOptin. The maximal size of requestable dynamic shared memory may differ by GPU architecture.
+ * - ::cudaFuncAttributePreferredSharedMemoryCarveout - On devices where the L1 cache and shared memory use the same hardware resources, 
+ *   this sets the shared memory carveout preference, in percent of the total shared memory. See ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
+ *   This is only a hint, and the driver can choose a different ratio if required to execute the function.
+ * - ::cudaFuncAttributeRequiredClusterWidth: The required cluster width in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeRequiredClusterHeight: The required cluster height in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeRequiredClusterDepth: The required cluster depth in
+ *   blocks. The width, height, and depth values must either all be 0 or all be
+ *   positive. The validity of the cluster dimensions is checked at launch time.
+ *   If the value is set during compile time, it cannot be set at runtime.
+ *   Setting it at runtime will return cudaErrorNotPermitted.
+ * - ::cudaFuncAttributeClusterSchedulingPolicyPreference: The block
+ *   scheduling policy of a function. The value type is cudaClusterSchedulingPolicy.
+ *
+ * \param entry - Function to get attributes of
+ * \param attr  - Attribute to set
+ * \param value - Value to set
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidDeviceFunction,
+ * ::cudaErrorInvalidValue
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \ref ::cudaLaunchKernel(const T *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream) "cudaLaunchKernel (C++ API)",
+ * \ref ::cudaFuncSetCacheConfig(T*, enum cudaFuncCache) "cudaFuncSetCacheConfig (C++ API)",
+ * \ref ::cudaFuncGetAttributes(struct cudaFuncAttributes*, const void*) "cudaFuncGetAttributes (C API)",
+ * ::cudaSetDoubleForDevice,
+ * ::cudaSetDoubleForHost
+ */
+template<class T>
+static __inline__ __host__ cudaError_t cudaFuncSetAttribute(
+  T                         *entry,
+  enum cudaFuncAttribute    attr,
+  int                       value
+)
+{
+  return ::cudaFuncSetAttribute((const void*)entry, attr, value);
+}
+
+/**
+ * \brief \hl Binds an array to a surface
+ *
+ * Binds the CUDA array \p array to the surface reference \p surf.
+ * \p desc describes how the memory is interpreted when dealing with
+ * the surface. Any CUDA array previously bound to \p surf is unbound.
+ *
+ * \param surf  - Surface to bind
+ * \param array - Memory array on device
+ * \param desc  - Channel format
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSurface
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
+ * \ref ::cudaBindSurfaceToArray(const struct surface<T, dim>&, cudaArray_const_t) "cudaBindSurfaceToArray (C++ API, inherited channel descriptor)"
+ */
+template<class T, int dim>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray(
+  const struct surface<T, dim>       &surf,
+  cudaArray_const_t                   array,
+  const struct cudaChannelFormatDesc &desc
+)
+{
+  return ::cudaBindSurfaceToArray(&surf, array, &desc);
+}
+
+/**
+ * \brief \hl Binds an array to a surface
+ *
+ * Binds the CUDA array \p array to the surface reference \p surf.
+ * The channel descriptor is inherited from the CUDA array. Any CUDA array
+ * previously bound to \p surf is unbound.
+ *
+ * \param surf  - Surface to bind
+ * \param array - Memory array on device
+ *
+ * \return
+ * ::cudaSuccess,
+ * ::cudaErrorInvalidValue,
+ * ::cudaErrorInvalidSurface
+ * \notefnerr
+ * \note_init_rt
+ * \note_callback
+ *
+ * \sa \ref ::cudaBindSurfaceToArray(const struct surfaceReference*, cudaArray_const_t, const struct cudaChannelFormatDesc*) "cudaBindSurfaceToArray (C API)",
+ * \ref ::cudaBindSurfaceToArray(const struct surface<T, dim>&, cudaArray_const_t, const struct cudaChannelFormatDesc&) "cudaBindSurfaceToArray (C++ API)"
+ */
+template<class T, int dim>
+static __CUDA_DEPRECATED __inline__ __host__ cudaError_t cudaBindSurfaceToArray(
+  const struct surface<T, dim> &surf,
+  cudaArray_const_t             array
+)
+{
+  struct cudaChannelFormatDesc desc;
+  cudaError_t                  err = ::cudaGetChannelDesc(&desc, array);
+
+  return err == cudaSuccess ? cudaBindSurfaceToArray(surf, array, desc) : err;
+}
+
+#endif /* __CUDACC__ */
+
+/** @} */ /* END CUDART_HIGHLEVEL */
+
+#endif /* __cplusplus && !__CUDACC_RTC__ */
+
+#if !defined(__CUDACC_RTC__)
+#if defined(__GNUC__)
+#if defined(__clang__) || (!defined(__PGIC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)))
+#pragma GCC diagnostic pop
+#endif
+#elif defined(_MSC_VER)
+#pragma warning(pop)
+#endif
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_CUDA_RUNTIME_H__
+#endif
+
+#endif /* !__CUDA_RUNTIME_H__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..21c8d944930456a914ba86e0972dbf6ba1bc5bba
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cuda_texture_types.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_TEXTURE_TYPES_H__)
+#define __CUDA_TEXTURE_TYPES_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDACC_RTC__)
+#define EXCLUDE_FROM_RTC
+#include "channel_descriptor.h"
+#undef EXCLUDE_FROM_RTC
+#endif /* !__CUDACC_RTC__ */
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+template<class T, int texType = cudaTextureType1D, enum cudaTextureReadMode mode = cudaReadModeElementType>
+struct __device_builtin_texture_type__ texture : public textureReference
+{
+#if !defined(__CUDACC_RTC__)
+  __host__ texture(int                         norm  = 0,
+                   enum cudaTextureFilterMode  fMode = cudaFilterModePoint,
+                   enum cudaTextureAddressMode aMode = cudaAddressModeClamp)
+  {
+    normalized     = norm;
+    filterMode     = fMode;
+    addressMode[0] = aMode;
+    addressMode[1] = aMode;
+    addressMode[2] = aMode;
+    channelDesc    = cudaCreateChannelDesc<T>();
+    sRGB           = 0;
+  }
+
+  __host__ texture(int                          norm,
+                   enum cudaTextureFilterMode   fMode,
+                   enum cudaTextureAddressMode  aMode,
+                   struct cudaChannelFormatDesc desc)
+  {
+    normalized     = norm;
+    filterMode     = fMode;
+    addressMode[0] = aMode;
+    addressMode[1] = aMode;
+    addressMode[2] = aMode;
+    channelDesc    = desc;
+    sRGB           = 0;
+  }
+#endif /* !__CUDACC_RTC__ */
+};
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__CUDA_TEXTURE_TYPES_H__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..50e427c39b164e6e145c3930cd66cc03806175a6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_atomic_functions.hpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_ATOMIC_FUNCTIONS_HPP__)
+#define __DEVICE_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __DEVICE_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAdd(int *address, int val)
+{
+  return __iAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAdd(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicSub(int *address, int val)
+{
+  return __iAtomicAdd(address, (unsigned int)-(int)val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicSub(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd(address, (unsigned int)-(int)val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicExch(int *address, int val)
+{
+  return __iAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicExch(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ float atomicExch(float *address, float val)
+{
+  return __fAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMin(int *address, int val)
+{
+  return __iAtomicMin(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMin(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicMax(int *address, int val)
+{
+  return __iAtomicMax(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicMax(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicInc(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicDec(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicAnd(int *address, int val)
+{
+  return __iAtomicAnd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicAnd(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicOr(int *address, int val)
+{
+  return __iAtomicOr(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicOr(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicXor(int *address, int val)
+{
+  return __iAtomicXor(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicXor(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ int atomicCAS(int *address, int compare, int val)
+{
+  return __iAtomicCAS(address, compare, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned int atomicCAS(unsigned int *address, unsigned int compare, unsigned int val)
+{
+  return __uAtomicCAS(address, compare, val);
+}
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicAdd(unsigned long long int *address, unsigned long long int val)
+{
+  return __ullAtomicAdd(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicExch(unsigned long long int *address, unsigned long long int val)
+{
+  return __ullAtomicExch(address, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ unsigned long long int atomicCAS(unsigned long long int *address, unsigned long long int compare, unsigned long long int val)
+{
+  return __ullAtomicCAS(address, compare, val);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool any(bool cond)
+{
+  return (bool)__any((int)cond);
+}
+
+__DEVICE_ATOMIC_FUNCTIONS_DECL__ bool all(bool cond)
+{
+  return (bool)__all((int)cond);
+}
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __DEVICE_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__DEVICE_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..82b25e59b40aeaf1e475ff3179e49640a44918b8
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_double_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("device_double_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "device_double_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
+#endif
+
+#include "crt/device_double_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DEVICE_DOUBLE_FUNCTIONS_H_WRAPPER__
+#endif
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f552db8faab7d21e90e06a1ea2184a5563d3bf2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/device_launch_parameters.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright 1993-2012 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DEVICE_LAUNCH_PARAMETERS_H__)
+#define __DEVICE_LAUNCH_PARAMETERS_H__
+
+#include "vector_types.h"
+
+#if !defined(__STORAGE__)
+
+#if defined(__CUDACC_RTC__)
+#define __STORAGE__ \
+        extern const __device__
+#else /* !__CUDACC_RTC__ */
+#define __STORAGE__ \
+        extern const
+#endif /* __CUDACC_RTC__ */
+
+#endif /* __STORAGE__ */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+uint3 __device_builtin__ __STORAGE__ threadIdx;
+uint3 __device_builtin__ __STORAGE__ blockIdx;
+dim3 __device_builtin__ __STORAGE__ blockDim;
+dim3 __device_builtin__ __STORAGE__ gridDim;
+int __device_builtin__ __STORAGE__ warpSize;
+
+#undef __STORAGE__
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#if !defined(__cudaGet_threadIdx)
+
+#define __cudaGet_threadIdx() \
+        threadIdx
+
+#endif /* __cudaGet_threadIdx */
+
+#if !defined(__cudaGet_blockIdx)
+
+#define __cudaGet_blockIdx() \
+        blockIdx
+
+#endif /* __cudaGet_blockIdx */
+
+#if !defined(__cudaGet_blockDim)
+
+#define __cudaGet_blockDim() \
+        blockDim
+
+#endif /* __cudaGet_blockDim */
+
+#if !defined(__cudaGet_gridDim)
+
+#define __cudaGet_gridDim() \
+        gridDim
+
+#endif /* __cudaGet_gridDim */
+
+#if !defined(__cudaGet_warpSize)
+
+#define __cudaGet_warpSize() \
+        warpSize
+
+#endif /* __cudaGet_warpSize */
+
+#endif /* !__DEVICE_LAUNCH_PARAMETERS_H__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..94767974220594550d496cad4d14c45349b27737
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_functions.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DRIVER_FUNCTIONS_H__)
+#define __DRIVER_FUNCTIONS_H__
+
+#include "builtin_types.h"
+#include "crt/host_defines.h"
+#include "driver_types.h"
+
+/**
+ * \addtogroup CUDART_MEMORY
+ *
+ * @{
+ */
+
+/**
+ * \brief Returns a cudaPitchedPtr based on input parameters
+ *
+ * Returns a ::cudaPitchedPtr based on the specified input parameters \p d,
+ * \p p, \p xsz, and \p ysz.
+ *
+ * \param d   - Pointer to allocated memory
+ * \param p   - Pitch of allocated memory in bytes
+ * \param xsz - Logical width of allocation in elements
+ * \param ysz - Logical height of allocation in elements
+ *
+ * \return
+ * ::cudaPitchedPtr specified by \p d, \p p, \p xsz, and \p ysz
+ *
+ * \sa make_cudaExtent, make_cudaPos
+ */
+static __inline__ __host__ struct cudaPitchedPtr make_cudaPitchedPtr(void *d, size_t p, size_t xsz, size_t ysz) 
+{
+  struct cudaPitchedPtr s;
+
+  s.ptr   = d;
+  s.pitch = p;
+  s.xsize = xsz;
+  s.ysize = ysz;
+
+  return s;
+}
+
+/**
+ * \brief Returns a cudaPos based on input parameters
+ *
+ * Returns a ::cudaPos based on the specified input parameters \p x,
+ * \p y, and \p z.
+ *
+ * \param x - X position
+ * \param y - Y position
+ * \param z - Z position
+ *
+ * \return
+ * ::cudaPos specified by \p x, \p y, and \p z
+ *
+ * \sa make_cudaExtent, make_cudaPitchedPtr
+ */
+static __inline__ __host__ struct cudaPos make_cudaPos(size_t x, size_t y, size_t z) 
+{
+  struct cudaPos p;
+
+  p.x = x;
+  p.y = y;
+  p.z = z;
+
+  return p;
+}
+
+/**
+ * \brief Returns a cudaExtent based on input parameters
+ *
+ * Returns a ::cudaExtent based on the specified input parameters \p w,
+ * \p h, and \p d.
+ *
+ * \param w - Width in elements when referring to array memory, in bytes when referring to linear memory
+ * \param h - Height in elements
+ * \param d - Depth in elements
+ *
+ * \return
+ * ::cudaExtent specified by \p w, \p h, and \p d
+ *
+ * \sa make_cudaPitchedPtr, make_cudaPos
+ */
+static __inline__ __host__ struct cudaExtent make_cudaExtent(size_t w, size_t h, size_t d) 
+{
+  struct cudaExtent e;
+
+  e.width  = w;
+  e.height = h;
+  e.depth  = d;
+
+  return e;
+}
+
+/** @} */ /* END CUDART_MEMORY */
+
+#endif /* !__DRIVER_FUNCTIONS_H__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..47b54f94d6a1dcd278ddda0dd0fa5a4ca866a5ff
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/driver_types.h
@@ -0,0 +1,3093 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__DRIVER_TYPES_H__)
+#define __DRIVER_TYPES_H__
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__
+#endif
+
+#ifndef __DOXYGEN_ONLY__
+#include "crt/host_defines.h"
+#endif
+#include "vector_types.h"
+
+
+
+/**
+ * \defgroup CUDART_TYPES Data types used by CUDA Runtime
+ * \ingroup CUDART
+ *
+ * @{
+ */
+
+/*******************************************************************************
+*                                                                              *
+*  TYPE DEFINITIONS USED BY RUNTIME API                                        *
+*                                                                              *
+*******************************************************************************/
+
+#if !defined(__CUDA_INTERNAL_COMPILATION__)
+
+#if !defined(__CUDACC_RTC__)
+#include <limits.h>
+#include <stddef.h>
+#endif /* !defined(__CUDACC_RTC__) */
+
+#define cudaHostAllocDefault                0x00  /**< Default page-locked allocation flag */
+#define cudaHostAllocPortable               0x01  /**< Pinned memory accessible by all CUDA contexts */
+#define cudaHostAllocMapped                 0x02  /**< Map allocation into device space */
+#define cudaHostAllocWriteCombined          0x04  /**< Write-combined memory */
+
+#define cudaHostRegisterDefault             0x00  /**< Default host memory registration flag */
+#define cudaHostRegisterPortable            0x01  /**< Pinned memory accessible by all CUDA contexts */
+#define cudaHostRegisterMapped              0x02  /**< Map registered memory into device space */
+#define cudaHostRegisterIoMemory            0x04  /**< Memory-mapped I/O space */
+#define cudaHostRegisterReadOnly            0x08  /**< Memory-mapped read-only */
+
+#define cudaPeerAccessDefault               0x00  /**< Default peer addressing enable flag */
+
+#define cudaStreamDefault                   0x00  /**< Default stream flag */
+#define cudaStreamNonBlocking               0x01  /**< Stream does not synchronize with stream 0 (the NULL stream) */
+
+ /**
+ * Legacy stream handle
+ *
+ * Stream handle that can be passed as a cudaStream_t to use an implicit stream
+ * with legacy synchronization behavior.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define cudaStreamLegacy                    ((cudaStream_t)0x1)
+
+/**
+ * Per-thread stream handle
+ *
+ * Stream handle that can be passed as a cudaStream_t to use an implicit stream
+ * with per-thread synchronization behavior.
+ *
+ * See details of the \link_sync_behavior
+ */
+#define cudaStreamPerThread                 ((cudaStream_t)0x2)
+
+#define cudaEventDefault                    0x00  /**< Default event flag */
+#define cudaEventBlockingSync               0x01  /**< Event uses blocking synchronization */
+#define cudaEventDisableTiming              0x02  /**< Event will not record timing data */
+#define cudaEventInterprocess               0x04  /**< Event is suitable for interprocess use. cudaEventDisableTiming must be set */
+
+#define cudaEventRecordDefault              0x00  /**< Default event record flag */
+#define cudaEventRecordExternal             0x01  /**< Event is captured in the graph as an external event node when performing stream capture */
+
+#define cudaEventWaitDefault                0x00  /**< Default event wait flag */
+#define cudaEventWaitExternal               0x01  /**< Event is captured in the graph as an external event node when performing stream capture */
+
+#define cudaDeviceScheduleAuto              0x00  /**< Device flag - Automatic scheduling */
+#define cudaDeviceScheduleSpin              0x01  /**< Device flag - Spin default scheduling */
+#define cudaDeviceScheduleYield             0x02  /**< Device flag - Yield default scheduling */
+#define cudaDeviceScheduleBlockingSync      0x04  /**< Device flag - Use blocking synchronization */
+#define cudaDeviceBlockingSync              0x04  /**< Device flag - Use blocking synchronization 
+                                                    *  \deprecated This flag was deprecated as of CUDA 4.0 and
+                                                    *  replaced with ::cudaDeviceScheduleBlockingSync. */
+#define cudaDeviceScheduleMask              0x07  /**< Device schedule flags mask */
+#define cudaDeviceMapHost                   0x08  /**< Device flag - Support mapped pinned allocations */
+#define cudaDeviceLmemResizeToMax           0x10  /**< Device flag - Keep local memory allocation after launch */
+#define cudaDeviceMask                      0x1f  /**< Device flags mask */
+
+#define cudaArrayDefault                    0x00  /**< Default CUDA array allocation flag */
+#define cudaArrayLayered                    0x01  /**< Must be set in cudaMalloc3DArray to create a layered CUDA array */
+#define cudaArraySurfaceLoadStore           0x02  /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to bind surfaces to the CUDA array */
+#define cudaArrayCubemap                    0x04  /**< Must be set in cudaMalloc3DArray to create a cubemap CUDA array */
+#define cudaArrayTextureGather              0x08  /**< Must be set in cudaMallocArray or cudaMalloc3DArray in order to perform texture gather operations on the CUDA array */
+#define cudaArrayColorAttachment            0x20  /**< Must be set in cudaExternalMemoryGetMappedMipmappedArray if the mipmapped array is used as a color target in a graphics API */
+#define cudaArraySparse                     0x40  /**< Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a sparse CUDA array or CUDA mipmapped array */
+#define cudaArrayDeferredMapping            0x80  /**< Must be set in cudaMallocArray, cudaMalloc3DArray or cudaMallocMipmappedArray in order to create a deferred mapping CUDA array or CUDA mipmapped array */
+
+#define cudaIpcMemLazyEnablePeerAccess      0x01  /**< Automatically enable peer access between remote devices as needed */
+
+#define cudaMemAttachGlobal                 0x01  /**< Memory can be accessed by any stream on any device*/
+#define cudaMemAttachHost                   0x02  /**< Memory cannot be accessed by any stream on any device */
+#define cudaMemAttachSingle                 0x04  /**< Memory can only be accessed by a single stream on the associated device */
+
+#define cudaOccupancyDefault                0x00  /**< Default behavior */
+#define cudaOccupancyDisableCachingOverride 0x01  /**< Assume global caching is enabled and cannot be automatically turned off */
+
+#define cudaCpuDeviceId                     ((int)-1) /**< Device id that represents the CPU */
+#define cudaInvalidDeviceId                 ((int)-2) /**< Device id that represents an invalid device */
+
+/**
+ * If set, each kernel launched as part of ::cudaLaunchCooperativeKernelMultiDevice only
+ * waits for prior work in the stream corresponding to that GPU to complete before the
+ * kernel begins execution.
+ */
+#define cudaCooperativeLaunchMultiDeviceNoPreSync  0x01
+
+/**
+ * If set, any subsequent work pushed in a stream that participated in a call to
+ * ::cudaLaunchCooperativeKernelMultiDevice will only wait for the kernel launched on
+ * the GPU corresponding to that stream to complete before it begins execution.
+ */
+#define cudaCooperativeLaunchMultiDeviceNoPostSync 0x02
+
+#endif /* !__CUDA_INTERNAL_COMPILATION__ */
+
+/** \cond impl_private */
+#if defined(__DOXYGEN_ONLY__) || defined(CUDA_ENABLE_DEPRECATED)
+#define __CUDA_DEPRECATED
+#elif defined(_MSC_VER)
+#define __CUDA_DEPRECATED __declspec(deprecated)
+#elif defined(__GNUC__)
+#define __CUDA_DEPRECATED __attribute__((deprecated))
+#else
+#define __CUDA_DEPRECATED
+#endif
+/** \endcond impl_private */
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * CUDA error types
+ */
+enum __device_builtin__ cudaError
+{
+    /**
+     * The API call returned with no errors. In the case of query calls, this
+     * also means that the operation being queried is complete (see
+     * ::cudaEventQuery() and ::cudaStreamQuery()).
+     */
+    cudaSuccess                           =      0,
+  
+    /**
+     * This indicates that one or more of the parameters passed to the API call
+     * is not within an acceptable range of values.
+     */
+    cudaErrorInvalidValue                 =     1,
+  
+    /**
+     * The API call failed because it was unable to allocate enough memory to
+     * perform the requested operation.
+     */
+    cudaErrorMemoryAllocation             =      2,
+  
+    /**
+     * The API call failed because the CUDA driver and runtime could not be
+     * initialized.
+     */
+    cudaErrorInitializationError          =      3,
+  
+    /**
+     * This indicates that a CUDA Runtime API call cannot be executed because
+     * it is being called during process shut down, at a point in time after
+     * CUDA driver has been unloaded.
+     */
+    cudaErrorCudartUnloading              =     4,
+
+    /**
+     * This indicates profiler is not initialized for this run. This can
+     * happen when the application is running with external profiling tools
+     * like visual profiler.
+     */
+    cudaErrorProfilerDisabled             =     5,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to attempt to enable/disable the profiling via ::cudaProfilerStart or
+     * ::cudaProfilerStop without initialization.
+     */
+    cudaErrorProfilerNotInitialized       =     6,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cudaProfilerStart() when profiling is already enabled.
+     */
+    cudaErrorProfilerAlreadyStarted       =     7,
+
+    /**
+     * \deprecated
+     * This error return is deprecated as of CUDA 5.0. It is no longer an error
+     * to call cudaProfilerStop() when profiling is already disabled.
+     */
+     cudaErrorProfilerAlreadyStopped       =    8,
+  
+    /**
+     * This indicates that a kernel launch is requesting resources that can
+     * never be satisfied by the current device. Requesting more shared memory
+     * per block than the device supports will trigger this error, as will
+     * requesting too many threads or blocks. See ::cudaDeviceProp for more
+     * device limitations.
+     */
+    cudaErrorInvalidConfiguration         =      9,
+  
+    /**
+     * This indicates that one or more of the pitch-related parameters passed
+     * to the API call is not within the acceptable range for pitch.
+     */
+    cudaErrorInvalidPitchValue            =     12,
+  
+    /**
+     * This indicates that the symbol name/identifier passed to the API call
+     * is not a valid name or identifier.
+     */
+    cudaErrorInvalidSymbol                =     13,
+  
+    /**
+     * This indicates that at least one host pointer passed to the API call is
+     * not a valid host pointer.
+     * \deprecated
+     * This error return is deprecated as of CUDA 10.1.
+     */
+    cudaErrorInvalidHostPointer           =     16,
+  
+    /**
+     * This indicates that at least one device pointer passed to the API call is
+     * not a valid device pointer.
+     * \deprecated
+     * This error return is deprecated as of CUDA 10.1.
+     */
+    cudaErrorInvalidDevicePointer         =     17,
+  
+    /**
+     * This indicates that the texture passed to the API call is not a valid
+     * texture.
+     */
+    cudaErrorInvalidTexture               =     18,
+  
+    /**
+     * This indicates that the texture binding is not valid. This occurs if you
+     * call ::cudaGetTextureAlignmentOffset() with an unbound texture.
+     */
+    cudaErrorInvalidTextureBinding        =     19,
+  
+    /**
+     * This indicates that the channel descriptor passed to the API call is not
+     * valid. This occurs if the format is not one of the formats specified by
+     * ::cudaChannelFormatKind, or if one of the dimensions is invalid.
+     */
+    cudaErrorInvalidChannelDescriptor     =     20,
+  
+    /**
+     * This indicates that the direction of the memcpy passed to the API call is
+     * not one of the types specified by ::cudaMemcpyKind.
+     */
+    cudaErrorInvalidMemcpyDirection       =     21,
+  
+    /**
+     * This indicated that the user has taken the address of a constant variable,
+     * which was forbidden up until the CUDA 3.1 release.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Variables in constant
+     * memory may now have their address taken by the runtime via
+     * ::cudaGetSymbolAddress().
+     */
+    cudaErrorAddressOfConstant            =     22,
+  
+    /**
+     * This indicated that a texture fetch was not able to be performed.
+     * This was previously used for device emulation of texture operations.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorTextureFetchFailed           =     23,
+  
+    /**
+     * This indicated that a texture was not bound for access.
+     * This was previously used for device emulation of texture operations.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorTextureNotBound              =     24,
+  
+    /**
+     * This indicated that a synchronization operation had failed.
+     * This was previously used for some device emulation functions.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorSynchronizationError         =     25,
+  
+    /**
+     * This indicates that a non-float texture was being accessed with linear
+     * filtering. This is not supported by CUDA.
+     */
+    cudaErrorInvalidFilterSetting         =     26,
+  
+    /**
+     * This indicates that an attempt was made to read a non-float texture as a
+     * normalized float. This is not supported by CUDA.
+     */
+    cudaErrorInvalidNormSetting           =     27,
+  
+    /**
+     * Mixing of device and device emulation code was not allowed.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorMixedDeviceExecution         =     28,
+
+    /**
+     * This indicates that the API call is not yet implemented. Production
+     * releases of CUDA will never return this error.
+     * \deprecated
+     * This error return is deprecated as of CUDA 4.1.
+     */
+    cudaErrorNotYetImplemented            =     31,
+  
+    /**
+     * This indicated that an emulated device pointer exceeded the 32-bit address
+     * range.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorMemoryValueTooLarge          =     32,
+  
+    /**
+     * This indicates that the CUDA driver that the application has loaded is a
+     * stub library. Applications that run with the stub rather than a real
+     * driver loaded will result in CUDA API returning this error.
+     */
+    cudaErrorStubLibrary                  =     34,
+
+    /**
+     * This indicates that the installed NVIDIA CUDA driver is older than the
+     * CUDA runtime library. This is not a supported configuration. Users should
+     * install an updated NVIDIA display driver to allow the application to run.
+     */
+    cudaErrorInsufficientDriver           =     35,
+
+    /**
+     * This indicates that the API call requires a newer CUDA driver than the one
+     * currently installed. Users should install an updated NVIDIA CUDA driver
+     * to allow the API call to succeed.
+     */
+    cudaErrorCallRequiresNewerDriver      =     36,
+  
+    /**
+     * This indicates that the surface passed to the API call is not a valid
+     * surface.
+     */
+    cudaErrorInvalidSurface               =     37,
+  
+    /**
+     * This indicates that multiple global or constant variables (across separate
+     * CUDA source files in the application) share the same string name.
+     */
+    cudaErrorDuplicateVariableName        =     43,
+  
+    /**
+     * This indicates that multiple textures (across separate CUDA source
+     * files in the application) share the same string name.
+     */
+    cudaErrorDuplicateTextureName         =     44,
+  
+    /**
+     * This indicates that multiple surfaces (across separate CUDA source
+     * files in the application) share the same string name.
+     */
+    cudaErrorDuplicateSurfaceName         =     45,
+  
+    /**
+     * This indicates that all CUDA devices are busy or unavailable at the current
+     * time. Devices are often busy/unavailable due to use of
+     * ::cudaComputeModeProhibited, ::cudaComputeModeExclusiveProcess, or when long
+     * running CUDA kernels have filled up the GPU and are blocking new work
+     * from starting. They can also be unavailable due to memory constraints
+     * on a device that already has active CUDA work being performed.
+     */
+    cudaErrorDevicesUnavailable           =     46,
+  
+    /**
+     * This indicates that the current context is not compatible with this
+     * the CUDA Runtime. This can only occur if you are using CUDA
+     * Runtime/Driver interoperability and have created an existing Driver
+     * context using the driver API. The Driver context may be incompatible
+     * either because the Driver context was created using an older version 
+     * of the API, because the Runtime API call expects a primary driver 
+     * context and the Driver context is not primary, or because the Driver 
+     * context has been destroyed. Please see \ref CUDART_DRIVER "Interactions 
+     * with the CUDA Driver API" for more information.
+     */
+    cudaErrorIncompatibleDriverContext    =     49,
+    
+    /**
+     * The device function being invoked (usually via ::cudaLaunchKernel()) was not
+     * previously configured via the ::cudaConfigureCall() function.
+     */
+    cudaErrorMissingConfiguration         =      52,
+  
+    /**
+     * This indicated that a previous kernel launch failed. This was previously
+     * used for device emulation of kernel launches.
+     * \deprecated
+     * This error return is deprecated as of CUDA 3.1. Device emulation mode was
+     * removed with the CUDA 3.1 release.
+     */
+    cudaErrorPriorLaunchFailure           =      53,
+
+    /**
+     * This error indicates that a device runtime grid launch did not occur 
+     * because the depth of the child grid would exceed the maximum supported
+     * number of nested grid launches. 
+     */
+    cudaErrorLaunchMaxDepthExceeded       =     65,
+
+    /**
+     * This error indicates that a grid launch did not occur because the kernel 
+     * uses file-scoped textures which are unsupported by the device runtime. 
+     * Kernels launched via the device runtime only support textures created with 
+     * the Texture Object API's.
+     */
+    cudaErrorLaunchFileScopedTex          =     66,
+
+    /**
+     * This error indicates that a grid launch did not occur because the kernel 
+     * uses file-scoped surfaces which are unsupported by the device runtime.
+     * Kernels launched via the device runtime only support surfaces created with
+     * the Surface Object API's.
+     */
+    cudaErrorLaunchFileScopedSurf         =     67,
+
+    /**
+     * This error indicates that a call to ::cudaDeviceSynchronize made from
+     * the device runtime failed because the call was made at grid depth greater
+     * than than either the default (2 levels of grids) or user specified device 
+     * limit ::cudaLimitDevRuntimeSyncDepth. To be able to synchronize on 
+     * launched grids at a greater depth successfully, the maximum nested 
+     * depth at which ::cudaDeviceSynchronize will be called must be specified 
+     * with the ::cudaLimitDevRuntimeSyncDepth limit to the ::cudaDeviceSetLimit
+     * api before the host-side launch of a kernel using the device runtime. 
+     * Keep in mind that additional levels of sync depth require the runtime 
+     * to reserve large amounts of device memory that cannot be used for 
+     * user allocations.
+     */
+    cudaErrorSyncDepthExceeded            =     68,
+
+    /**
+     * This error indicates that a device runtime grid launch failed because
+     * the launch would exceed the limit ::cudaLimitDevRuntimePendingLaunchCount.
+     * For this launch to proceed successfully, ::cudaDeviceSetLimit must be
+     * called to set the ::cudaLimitDevRuntimePendingLaunchCount to be higher 
+     * than the upper bound of outstanding launches that can be issued to the
+     * device runtime. Keep in mind that raising the limit of pending device
+     * runtime launches will require the runtime to reserve device memory that
+     * cannot be used for user allocations.
+     */
+    cudaErrorLaunchPendingCountExceeded   =     69,
+  
+    /**
+     * The requested device function does not exist or is not compiled for the
+     * proper device architecture.
+     */
+    cudaErrorInvalidDeviceFunction        =      98,
+  
+    /**
+     * This indicates that no CUDA-capable devices were detected by the installed
+     * CUDA driver.
+     */
+    cudaErrorNoDevice                     =     100,
+  
+    /**
+     * This indicates that the device ordinal supplied by the user does not
+     * correspond to a valid CUDA device or that the action requested is
+     * invalid for the specified device.
+     */
+    cudaErrorInvalidDevice                =     101,
+
+    /**
+     * This indicates that the device doesn't have a valid Grid License.
+     */
+    cudaErrorDeviceNotLicensed            =     102,
+
+   /**
+    * By default, the CUDA runtime may perform a minimal set of self-tests,
+    * as well as CUDA driver tests, to establish the validity of both.
+    * Introduced in CUDA 11.2, this error return indicates that at least one
+    * of these tests has failed and the validity of either the runtime
+    * or the driver could not be established.
+    */
+   cudaErrorSoftwareValidityNotEstablished  =     103,
+
+    /**
+     * This indicates an internal startup failure in the CUDA runtime.
+     */
+    cudaErrorStartupFailure               =    127,
+  
+    /**
+     * This indicates that the device kernel image is invalid.
+     */
+    cudaErrorInvalidKernelImage           =     200,
+
+    /**
+     * This most frequently indicates that there is no context bound to the
+     * current thread. This can also be returned if the context passed to an
+     * API call is not a valid handle (such as a context that has had
+     * ::cuCtxDestroy() invoked on it). This can also be returned if a user
+     * mixes different API versions (i.e. 3010 context with 3020 API calls).
+     * See ::cuCtxGetApiVersion() for more details.
+     */
+    cudaErrorDeviceUninitialized          =     201,
+
+    /**
+     * This indicates that the buffer object could not be mapped.
+     */
+    cudaErrorMapBufferObjectFailed        =     205,
+  
+    /**
+     * This indicates that the buffer object could not be unmapped.
+     */
+    cudaErrorUnmapBufferObjectFailed      =     206,
+
+    /**
+     * This indicates that the specified array is currently mapped and thus
+     * cannot be destroyed.
+     */
+    cudaErrorArrayIsMapped                =     207,
+
+    /**
+     * This indicates that the resource is already mapped.
+     */
+    cudaErrorAlreadyMapped                =     208,
+  
+    /**
+     * This indicates that there is no kernel image available that is suitable
+     * for the device. This can occur when a user specifies code generation
+     * options for a particular CUDA source file that do not include the
+     * corresponding device configuration.
+     */
+    cudaErrorNoKernelImageForDevice       =     209,
+
+    /**
+     * This indicates that a resource has already been acquired.
+     */
+    cudaErrorAlreadyAcquired              =     210,
+
+    /**
+     * This indicates that a resource is not mapped.
+     */
+    cudaErrorNotMapped                    =     211,
+
+    /**
+     * This indicates that a mapped resource is not available for access as an
+     * array.
+     */
+    cudaErrorNotMappedAsArray             =     212,
+
+    /**
+     * This indicates that a mapped resource is not available for access as a
+     * pointer.
+     */
+    cudaErrorNotMappedAsPointer           =     213,
+  
+    /**
+     * This indicates that an uncorrectable ECC error was detected during
+     * execution.
+     */
+    cudaErrorECCUncorrectable             =     214,
+  
+    /**
+     * This indicates that the ::cudaLimit passed to the API call is not
+     * supported by the active device.
+     */
+    cudaErrorUnsupportedLimit             =     215,
+    
+    /**
+     * This indicates that a call tried to access an exclusive-thread device that 
+     * is already in use by a different thread.
+     */
+    cudaErrorDeviceAlreadyInUse           =     216,
+
+    /**
+     * This error indicates that P2P access is not supported across the given
+     * devices.
+     */
+    cudaErrorPeerAccessUnsupported        =     217,
+
+    /**
+     * A PTX compilation failed. The runtime may fall back to compiling PTX if
+     * an application does not contain a suitable binary for the current device.
+     */
+    cudaErrorInvalidPtx                   =     218,
+
+    /**
+     * This indicates an error with the OpenGL or DirectX context.
+     */
+    cudaErrorInvalidGraphicsContext       =     219,
+
+    /**
+     * This indicates that an uncorrectable NVLink error was detected during the
+     * execution.
+     */
+    cudaErrorNvlinkUncorrectable          =     220,
+
+    /**
+     * This indicates that the PTX JIT compiler library was not found. The JIT Compiler
+     * library is used for PTX compilation. The runtime may fall back to compiling PTX
+     * if an application does not contain a suitable binary for the current device.
+     */
+    cudaErrorJitCompilerNotFound          =     221,
+
+    /**
+     * This indicates that the provided PTX was compiled with an unsupported toolchain.
+     * The most common reason for this, is the PTX was generated by a compiler newer
+     * than what is supported by the CUDA driver and PTX JIT compiler.
+     */
+    cudaErrorUnsupportedPtxVersion        =     222,
+
+    /**
+     * This indicates that the JIT compilation was disabled. The JIT compilation compiles
+     * PTX. The runtime may fall back to compiling PTX if an application does not contain
+     * a suitable binary for the current device.
+     */
+    cudaErrorJitCompilationDisabled       =     223,
+
+    /**
+     * This indicates that the provided execution affinity is not supported by the device.
+     */
+    cudaErrorUnsupportedExecAffinity      =     224,
+
+    /**
+     * This indicates that the device kernel source is invalid.
+     */
+    cudaErrorInvalidSource                =     300,
+
+    /**
+     * This indicates that the file specified was not found.
+     */
+    cudaErrorFileNotFound                 =     301,
+  
+    /**
+     * This indicates that a link to a shared object failed to resolve.
+     */
+    cudaErrorSharedObjectSymbolNotFound   =     302,
+  
+    /**
+     * This indicates that initialization of a shared object failed.
+     */
+    cudaErrorSharedObjectInitFailed       =     303,
+
+    /**
+     * This error indicates that an OS call failed.
+     */
+    cudaErrorOperatingSystem              =     304,
+  
+    /**
+     * This indicates that a resource handle passed to the API call was not
+     * valid. Resource handles are opaque types like ::cudaStream_t and
+     * ::cudaEvent_t.
+     */
+    cudaErrorInvalidResourceHandle        =     400,
+
+    /**
+     * This indicates that a resource required by the API call is not in a
+     * valid state to perform the requested operation.
+     */
+    cudaErrorIllegalState                 =     401,
+
+    /**
+     * This indicates that a named symbol was not found. Examples of symbols
+     * are global/constant variable names, driver function names, texture names,
+     * and surface names.
+     */
+    cudaErrorSymbolNotFound               =     500,
+  
+    /**
+     * This indicates that asynchronous operations issued previously have not
+     * completed yet. This result is not actually an error, but must be indicated
+     * differently than ::cudaSuccess (which indicates completion). Calls that
+     * may return this value include ::cudaEventQuery() and ::cudaStreamQuery().
+     */
+    cudaErrorNotReady                     =     600,
+
+    /**
+     * The device encountered a load or store instruction on an invalid memory address.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorIllegalAddress               =     700,
+  
+    /**
+     * This indicates that a launch did not occur because it did not have
+     * appropriate resources. Although this error is similar to
+     * ::cudaErrorInvalidConfiguration, this error usually indicates that the
+     * user has attempted to pass too many arguments to the device kernel, or the
+     * kernel launch specifies too many threads for the kernel's register count.
+     */
+    cudaErrorLaunchOutOfResources         =      701,
+  
+    /**
+     * This indicates that the device kernel took too long to execute. This can
+     * only occur if timeouts are enabled - see the device property
+     * \ref ::cudaDeviceProp::kernelExecTimeoutEnabled "kernelExecTimeoutEnabled"
+     * for more information.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorLaunchTimeout                =      702,
+
+    /**
+     * This error indicates a kernel launch that uses an incompatible texturing
+     * mode.
+     */
+    cudaErrorLaunchIncompatibleTexturing  =     703,
+      
+    /**
+     * This error indicates that a call to ::cudaDeviceEnablePeerAccess() is
+     * trying to re-enable peer addressing on from a context which has already
+     * had peer addressing enabled.
+     */
+    cudaErrorPeerAccessAlreadyEnabled     =     704,
+    
+    /**
+     * This error indicates that ::cudaDeviceDisablePeerAccess() is trying to 
+     * disable peer addressing which has not been enabled yet via 
+     * ::cudaDeviceEnablePeerAccess().
+     */
+    cudaErrorPeerAccessNotEnabled         =     705,
+  
+    /**
+     * This indicates that the user has called ::cudaSetValidDevices(),
+     * ::cudaSetDeviceFlags(), ::cudaD3D9SetDirect3DDevice(),
+     * ::cudaD3D10SetDirect3DDevice, ::cudaD3D11SetDirect3DDevice(), or
+     * ::cudaVDPAUSetVDPAUDevice() after initializing the CUDA runtime by
+     * calling non-device management operations (allocating memory and
+     * launching kernels are examples of non-device management operations).
+     * This error can also be returned if using runtime/driver
+     * interoperability and there is an existing ::CUcontext active on the
+     * host thread.
+     */
+    cudaErrorSetOnActiveProcess           =     708,
+
+    /**
+     * This error indicates that the context current to the calling thread
+     * has been destroyed using ::cuCtxDestroy, or is a primary context which
+     * has not yet been initialized.
+     */
+    cudaErrorContextIsDestroyed           =     709,
+
+    /**
+     * An assert triggered in device code during kernel execution. The device
+     * cannot be used again. All existing allocations are invalid. To continue
+     * using CUDA, the process must be terminated and relaunched.
+     */
+    cudaErrorAssert                        =    710,
+  
+    /**
+     * This error indicates that the hardware resources required to enable
+     * peer access have been exhausted for one or more of the devices 
+     * passed to ::cudaEnablePeerAccess().
+     */
+    cudaErrorTooManyPeers                 =     711,
+  
+    /**
+     * This error indicates that the memory range passed to ::cudaHostRegister()
+     * has already been registered.
+     */
+    cudaErrorHostMemoryAlreadyRegistered  =     712,
+        
+    /**
+     * This error indicates that the pointer passed to ::cudaHostUnregister()
+     * does not correspond to any currently registered memory region.
+     */
+    cudaErrorHostMemoryNotRegistered      =     713,
+
+    /**
+     * Device encountered an error in the call stack during kernel execution,
+     * possibly due to stack corruption or exceeding the stack size limit.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorHardwareStackError           =     714,
+
+    /**
+     * The device encountered an illegal instruction during kernel execution
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorIllegalInstruction           =     715,
+
+    /**
+     * The device encountered a load or store instruction
+     * on a memory address which is not aligned.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorMisalignedAddress            =     716,
+
+    /**
+     * While executing a kernel, the device encountered an instruction
+     * which can only operate on memory locations in certain address spaces
+     * (global, shared, or local), but was supplied a memory address not
+     * belonging to an allowed address space.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorInvalidAddressSpace          =     717,
+
+    /**
+     * The device encountered an invalid program counter.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorInvalidPc                    =     718,
+  
+    /**
+     * An exception occurred on the device while executing a kernel. Common
+     * causes include dereferencing an invalid device pointer and accessing
+     * out of bounds shared memory. Less common cases can be system specific - more
+     * information about these cases can be found in the system specific user guide.
+     * This leaves the process in an inconsistent state and any further CUDA work
+     * will return the same error. To continue using CUDA, the process must be terminated
+     * and relaunched.
+     */
+    cudaErrorLaunchFailure                =      719,
+
+    /**
+     * This error indicates that the number of blocks launched per grid for a kernel that was
+     * launched via either ::cudaLaunchCooperativeKernel or ::cudaLaunchCooperativeKernelMultiDevice
+     * exceeds the maximum number of blocks as allowed by ::cudaOccupancyMaxActiveBlocksPerMultiprocessor
+     * or ::cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags times the number of multiprocessors
+     * as specified by the device attribute ::cudaDevAttrMultiProcessorCount.
+     */
+    cudaErrorCooperativeLaunchTooLarge    =     720,
+    
+    /**
+     * This error indicates the attempted operation is not permitted.
+     */
+    cudaErrorNotPermitted                 =     800,
+
+    /**
+     * This error indicates the attempted operation is not supported
+     * on the current system or device.
+     */
+    cudaErrorNotSupported                 =     801,
+
+    /**
+     * This error indicates that the system is not yet ready to start any CUDA
+     * work.  To continue using CUDA, verify the system configuration is in a
+     * valid state and all required driver daemons are actively running.
+     * More information about this error can be found in the system specific
+     * user guide.
+     */
+    cudaErrorSystemNotReady               =     802,
+
+    /**
+     * This error indicates that there is a mismatch between the versions of
+     * the display driver and the CUDA driver. Refer to the compatibility documentation
+     * for supported versions.
+     */
+    cudaErrorSystemDriverMismatch         =     803,
+
+    /**
+     * This error indicates that the system was upgraded to run with forward compatibility
+     * but the visible hardware detected by CUDA does not support this configuration.
+     * Refer to the compatibility documentation for the supported hardware matrix or ensure
+     * that only supported hardware is visible during initialization via the CUDA_VISIBLE_DEVICES
+     * environment variable.
+     */
+    cudaErrorCompatNotSupportedOnDevice   =     804,
+
+    /**
+     * This error indicates that the MPS client failed to connect to the MPS control daemon or the MPS server.
+     */
+    cudaErrorMpsConnectionFailed          =     805,
+
+    /**
+     * This error indicates that the remote procedural call between the MPS server and the MPS client failed.
+     */
+    cudaErrorMpsRpcFailure                =     806,
+
+    /**
+     * This error indicates that the MPS server is not ready to accept new MPS client requests.
+     * This error can be returned when the MPS server is in the process of recovering from a fatal failure.
+     */
+    cudaErrorMpsServerNotReady            =     807,
+
+    /**
+     * This error indicates that the hardware resources required to create MPS client have been exhausted.
+     */
+    cudaErrorMpsMaxClientsReached         =     808,
+
+    /**
+     * This error indicates the the hardware resources required to device connections have been exhausted.
+     */
+    cudaErrorMpsMaxConnectionsReached     =     809,
+
+    /**
+     * This error indicates that the MPS client has been terminated by the server. To continue using CUDA, the process must be terminated and relaunched.
+     */
+    cudaErrorMpsClientTerminated          =     810,
+
+    /**
+     * The operation is not permitted when the stream is capturing.
+     */
+    cudaErrorStreamCaptureUnsupported     =    900,
+
+    /**
+     * The current capture sequence on the stream has been invalidated due to
+     * a previous error.
+     */
+    cudaErrorStreamCaptureInvalidated     =    901,
+
+    /**
+     * The operation would have resulted in a merge of two independent capture
+     * sequences.
+     */
+    cudaErrorStreamCaptureMerge           =    902,
+
+    /**
+     * The capture was not initiated in this stream.
+     */
+    cudaErrorStreamCaptureUnmatched       =    903,
+
+    /**
+     * The capture sequence contains a fork that was not joined to the primary
+     * stream.
+     */
+    cudaErrorStreamCaptureUnjoined        =    904,
+
+    /**
+     * A dependency would have been created which crosses the capture sequence
+     * boundary. Only implicit in-stream ordering dependencies are allowed to
+     * cross the boundary.
+     */
+    cudaErrorStreamCaptureIsolation       =    905,
+
+    /**
+     * The operation would have resulted in a disallowed implicit dependency on
+     * a current capture sequence from cudaStreamLegacy.
+     */
+    cudaErrorStreamCaptureImplicit        =    906,
+
+    /**
+     * The operation is not permitted on an event which was last recorded in a
+     * capturing stream.
+     */
+    cudaErrorCapturedEvent                =    907,
+  
+    /**
+     * A stream capture sequence not initiated with the ::cudaStreamCaptureModeRelaxed
+     * argument to ::cudaStreamBeginCapture was passed to ::cudaStreamEndCapture in a
+     * different thread.
+     */
+    cudaErrorStreamCaptureWrongThread     =    908,
+
+    /**
+     * This indicates that the wait operation has timed out.
+     */
+    cudaErrorTimeout                      =    909,
+
+    /**
+     * This error indicates that the graph update was not performed because it included 
+     * changes which violated constraints specific to instantiated graph update.
+     */
+    cudaErrorGraphExecUpdateFailure       =    910,
+
+    /**
+     * This indicates that an async error has occurred in a device outside of CUDA.
+     * If CUDA was waiting for an external device's signal before consuming shared data,
+     * the external device signaled an error indicating that the data is not valid for
+     * consumption. This leaves the process in an inconsistent state and any further CUDA
+     * work will return the same error. To continue using CUDA, the process must be
+     * terminated and relaunched.
+     */
+    cudaErrorExternalDevice               =    911,
+
+    /**
+     * This indicates that a kernel launch error has occurred due to cluster
+     * misconfiguration.
+     */
+    cudaErrorInvalidClusterSize           =    912,
+
+    /**
+     * This indicates that an unknown internal error has occurred.
+     */
+    cudaErrorUnknown                      =    999,
+
+    /**
+     * Any unhandled CUDA driver error is added to this value and returned via
+     * the runtime. Production releases of CUDA should not return such errors.
+     * \deprecated
+     * This error return is deprecated as of CUDA 4.1.
+     */
+    cudaErrorApiFailureBase               =  10000
+};
+
+/**
+ * Channel format kind
+ */
+enum __device_builtin__ cudaChannelFormatKind
+{
+    cudaChannelFormatKindSigned                         =   0,      /**< Signed channel format */
+    cudaChannelFormatKindUnsigned                       =   1,      /**< Unsigned channel format */
+    cudaChannelFormatKindFloat                          =   2,      /**< Float channel format */
+    cudaChannelFormatKindNone                           =   3,      /**< No channel format */
+    cudaChannelFormatKindNV12                           =   4,      /**< Unsigned 8-bit integers, planar 4:2:0 YUV format */
+    cudaChannelFormatKindUnsignedNormalized8X1          =   5,      /**< 1 channel unsigned 8-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized8X2          =   6,      /**< 2 channel unsigned 8-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized8X4          =   7,      /**< 4 channel unsigned 8-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized16X1         =   8,      /**< 1 channel unsigned 16-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized16X2         =   9,      /**< 2 channel unsigned 16-bit normalized integer */
+    cudaChannelFormatKindUnsignedNormalized16X4         =   10,     /**< 4 channel unsigned 16-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized8X1            =   11,     /**< 1 channel signed 8-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized8X2            =   12,     /**< 2 channel signed 8-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized8X4            =   13,     /**< 4 channel signed 8-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized16X1           =   14,     /**< 1 channel signed 16-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized16X2           =   15,     /**< 2 channel signed 16-bit normalized integer */
+    cudaChannelFormatKindSignedNormalized16X4           =   16,     /**< 4 channel signed 16-bit normalized integer */
+    cudaChannelFormatKindUnsignedBlockCompressed1       =   17,     /**< 4 channel unsigned normalized block-compressed (BC1 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed1SRGB   =   18,     /**< 4 channel unsigned normalized block-compressed (BC1 compression) format with sRGB encoding*/
+    cudaChannelFormatKindUnsignedBlockCompressed2       =   19,     /**< 4 channel unsigned normalized block-compressed (BC2 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed2SRGB   =   20,     /**< 4 channel unsigned normalized block-compressed (BC2 compression) format with sRGB encoding */
+    cudaChannelFormatKindUnsignedBlockCompressed3       =   21,     /**< 4 channel unsigned normalized block-compressed (BC3 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed3SRGB   =   22,     /**< 4 channel unsigned normalized block-compressed (BC3 compression) format with sRGB encoding */
+    cudaChannelFormatKindUnsignedBlockCompressed4       =   23,     /**< 1 channel unsigned normalized block-compressed (BC4 compression) format */
+    cudaChannelFormatKindSignedBlockCompressed4         =   24,     /**< 1 channel signed normalized block-compressed (BC4 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed5       =   25,     /**< 2 channel unsigned normalized block-compressed (BC5 compression) format */
+    cudaChannelFormatKindSignedBlockCompressed5         =   26,     /**< 2 channel signed normalized block-compressed (BC5 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed6H      =   27,     /**< 3 channel unsigned half-float block-compressed (BC6H compression) format */
+    cudaChannelFormatKindSignedBlockCompressed6H        =   28,     /**< 3 channel signed half-float block-compressed (BC6H compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed7       =   29,     /**< 4 channel unsigned normalized block-compressed (BC7 compression) format */
+    cudaChannelFormatKindUnsignedBlockCompressed7SRGB   =   30      /**< 4 channel unsigned normalized block-compressed (BC7 compression) format with sRGB encoding */
+};
+
+/**
+ * CUDA Channel format descriptor
+ */
+struct __device_builtin__ cudaChannelFormatDesc
+{
+    int                        x; /**< x */
+    int                        y; /**< y */
+    int                        z; /**< z */
+    int                        w; /**< w */
+    enum cudaChannelFormatKind f; /**< Channel format kind */
+};
+
+/**
+ * CUDA array
+ */
+typedef struct cudaArray *cudaArray_t;
+
+/**
+ * CUDA array (as source copy argument)
+ */
+typedef const struct cudaArray *cudaArray_const_t;
+
+struct cudaArray;
+
+/**
+ * CUDA mipmapped array
+ */
+typedef struct cudaMipmappedArray *cudaMipmappedArray_t;
+
+/**
+ * CUDA mipmapped array (as source argument)
+ */
+typedef const struct cudaMipmappedArray *cudaMipmappedArray_const_t;
+
+struct cudaMipmappedArray;
+
+/**
+ * Indicates that the layered sparse CUDA array or CUDA mipmapped array has a single mip tail region for all layers
+ */
+#define cudaArraySparsePropertiesSingleMipTail   0x1
+
+/**
+ * Sparse CUDA array and CUDA mipmapped array properties
+ */
+struct __device_builtin__ cudaArraySparseProperties {
+    struct {
+        unsigned int width;             /**< Tile width in elements */
+        unsigned int height;            /**< Tile height in elements */
+        unsigned int depth;             /**< Tile depth in elements */
+    } tileExtent;
+    unsigned int miptailFirstLevel;     /**< First mip level at which the mip tail begins */   
+    unsigned long long miptailSize;     /**< Total size of the mip tail. */
+    unsigned int flags;                 /**< Flags will either be zero or ::cudaArraySparsePropertiesSingleMipTail */
+    unsigned int reserved[4];
+};
+
+/**
+ * CUDA array and CUDA mipmapped array memory requirements
+ */
+struct __device_builtin__ cudaArrayMemoryRequirements {
+    size_t size;                    /**< Total size of the array. */
+    size_t alignment;               /**< Alignment necessary for mapping the array. */
+    unsigned int reserved[4];
+};
+
+/**
+ * CUDA memory types
+ */
+enum __device_builtin__ cudaMemoryType
+{
+    cudaMemoryTypeUnregistered = 0, /**< Unregistered memory */
+    cudaMemoryTypeHost         = 1, /**< Host memory */
+    cudaMemoryTypeDevice       = 2, /**< Device memory */
+    cudaMemoryTypeManaged      = 3  /**< Managed memory */
+};
+
+/**
+ * CUDA memory copy types
+ */
+enum __device_builtin__ cudaMemcpyKind
+{
+    cudaMemcpyHostToHost          =   0,      /**< Host   -> Host */
+    cudaMemcpyHostToDevice        =   1,      /**< Host   -> Device */
+    cudaMemcpyDeviceToHost        =   2,      /**< Device -> Host */
+    cudaMemcpyDeviceToDevice      =   3,      /**< Device -> Device */
+    cudaMemcpyDefault             =   4       /**< Direction of the transfer is inferred from the pointer values. Requires unified virtual addressing */
+};
+
+/**
+ * CUDA Pitched memory pointer
+ *
+ * \sa ::make_cudaPitchedPtr
+ */
+struct __device_builtin__ cudaPitchedPtr
+{
+    void   *ptr;      /**< Pointer to allocated memory */
+    size_t  pitch;    /**< Pitch of allocated memory in bytes */
+    size_t  xsize;    /**< Logical width of allocation in elements */
+    size_t  ysize;    /**< Logical height of allocation in elements */
+};
+
+/**
+ * CUDA extent
+ *
+ * \sa ::make_cudaExtent
+ */
+struct __device_builtin__ cudaExtent
+{
+    size_t width;     /**< Width in elements when referring to array memory, in bytes when referring to linear memory */
+    size_t height;    /**< Height in elements */
+    size_t depth;     /**< Depth in elements */
+};
+
+/**
+ * CUDA 3D position
+ *
+ * \sa ::make_cudaPos
+ */
+struct __device_builtin__ cudaPos
+{
+    size_t x;     /**< x */
+    size_t y;     /**< y */
+    size_t z;     /**< z */
+};
+
+/**
+ * CUDA 3D memory copying parameters
+ */
+struct __device_builtin__ cudaMemcpy3DParms
+{
+    cudaArray_t            srcArray;  /**< Source memory address */
+    struct cudaPos         srcPos;    /**< Source position offset */
+    struct cudaPitchedPtr  srcPtr;    /**< Pitched source memory address */
+  
+    cudaArray_t            dstArray;  /**< Destination memory address */
+    struct cudaPos         dstPos;    /**< Destination position offset */
+    struct cudaPitchedPtr  dstPtr;    /**< Pitched destination memory address */
+  
+    struct cudaExtent      extent;    /**< Requested memory copy size */
+    enum cudaMemcpyKind    kind;      /**< Type of transfer */
+};
+
+/**
+ * CUDA 3D cross-device memory copying parameters
+ */
+struct __device_builtin__ cudaMemcpy3DPeerParms
+{
+    cudaArray_t            srcArray;  /**< Source memory address */
+    struct cudaPos         srcPos;    /**< Source position offset */
+    struct cudaPitchedPtr  srcPtr;    /**< Pitched source memory address */
+    int                    srcDevice; /**< Source device */
+  
+    cudaArray_t            dstArray;  /**< Destination memory address */
+    struct cudaPos         dstPos;    /**< Destination position offset */
+    struct cudaPitchedPtr  dstPtr;    /**< Pitched destination memory address */
+    int                    dstDevice; /**< Destination device */
+  
+    struct cudaExtent      extent;    /**< Requested memory copy size */
+};
+
+/**
+ * CUDA Memset node parameters
+ */
+struct __device_builtin__  cudaMemsetParams {
+    void *dst;                              /**< Destination device pointer */
+    size_t pitch;                           /**< Pitch of destination device pointer. Unused if height is 1 */
+    unsigned int value;                     /**< Value to be set */
+    unsigned int elementSize;               /**< Size of each element in bytes. Must be 1, 2, or 4. */
+    size_t width;                           /**< Width of the row in elements */
+    size_t height;                          /**< Number of rows */
+};
+
+/**
+ * Specifies performance hint with ::cudaAccessPolicyWindow for hitProp and missProp members.
+ */
+enum __device_builtin__  cudaAccessProperty {
+    cudaAccessPropertyNormal = 0,       /**< Normal cache persistence. */
+    cudaAccessPropertyStreaming = 1,    /**< Streaming access is less likely to persit from cache. */
+    cudaAccessPropertyPersisting = 2    /**< Persisting access is more likely to persist in cache.*/
+};
+
+/**
+ * Specifies an access policy for a window, a contiguous extent of memory
+ * beginning at base_ptr and ending at base_ptr + num_bytes.
+ * Partition into many segments and assign segments such that.
+ * sum of "hit segments" / window == approx. ratio.
+ * sum of "miss segments" / window == approx 1-ratio.
+ * Segments and ratio specifications are fitted to the capabilities of
+ * the architecture.
+ * Accesses in a hit segment apply the hitProp access policy.
+ * Accesses in a miss segment apply the missProp access policy.
+ */
+struct __device_builtin__ cudaAccessPolicyWindow {
+    void *base_ptr;                     /**< Starting address of the access policy window. CUDA driver may align it. */
+    size_t num_bytes;                   /**< Size in bytes of the window policy. CUDA driver may restrict the maximum size and alignment. */
+    float hitRatio;                     /**< hitRatio specifies percentage of lines assigned hitProp, rest are assigned missProp. */
+    enum cudaAccessProperty hitProp;    /**< ::CUaccessProperty set for hit. */
+    enum cudaAccessProperty missProp;   /**< ::CUaccessProperty set for miss. Must be either NORMAL or STREAMING. */
+};
+
+#ifdef _WIN32
+#define CUDART_CB __stdcall
+#else
+#define CUDART_CB
+#endif
+
+/**
+ * CUDA host function
+ * \param userData Argument value passed to the function
+ */
+typedef void (CUDART_CB *cudaHostFn_t)(void *userData);
+
+/**
+ * CUDA host node parameters
+ */
+struct __device_builtin__ cudaHostNodeParams {
+    cudaHostFn_t fn;    /**< The function to call when the node executes */
+    void* userData; /**< Argument to pass to the function */
+};
+
+/**
+ * Possible stream capture statuses returned by ::cudaStreamIsCapturing
+ */
+enum __device_builtin__ cudaStreamCaptureStatus {
+    cudaStreamCaptureStatusNone        = 0, /**< Stream is not capturing */
+    cudaStreamCaptureStatusActive      = 1, /**< Stream is actively capturing */
+    cudaStreamCaptureStatusInvalidated = 2  /**< Stream is part of a capture sequence that
+                                                   has been invalidated, but not terminated */
+};
+
+/**
+ * Possible modes for stream capture thread interactions. For more details see
+ * ::cudaStreamBeginCapture and ::cudaThreadExchangeStreamCaptureMode
+ */
+enum __device_builtin__ cudaStreamCaptureMode {
+    cudaStreamCaptureModeGlobal      = 0,
+    cudaStreamCaptureModeThreadLocal = 1,
+    cudaStreamCaptureModeRelaxed     = 2
+};
+
+enum __device_builtin__ cudaSynchronizationPolicy {
+    cudaSyncPolicyAuto = 1,
+    cudaSyncPolicySpin = 2,
+    cudaSyncPolicyYield = 3,
+    cudaSyncPolicyBlockingSync = 4
+};
+
+/**
+ * Cluster scheduling policies. These may be passed to ::cudaFuncSetAttribute
+ */
+enum __device_builtin__ cudaClusterSchedulingPolicy {
+    cudaClusterSchedulingPolicyDefault       = 0, /**< the default policy */
+    cudaClusterSchedulingPolicySpread        = 1, /**< spread the blocks within a cluster to the SMs */
+    cudaClusterSchedulingPolicyLoadBalancing = 2  /**< allow the hardware to load-balance the blocks in a cluster to the SMs */
+};
+
+/**
+ * Flags for ::cudaStreamUpdateCaptureDependencies
+ */
+enum __device_builtin__ cudaStreamUpdateCaptureDependenciesFlags {
+    cudaStreamAddCaptureDependencies = 0x0, /**< Add new nodes to the dependency set */
+    cudaStreamSetCaptureDependencies = 0x1  /**< Replace the dependency set with the new nodes */
+};
+
+/**
+ * Flags for user objects for graphs
+ */
+enum __device_builtin__ cudaUserObjectFlags {
+    cudaUserObjectNoDestructorSync = 0x1  /**< Indicates the destructor execution is not synchronized by any CUDA handle. */
+};
+
+/**
+ * Flags for retaining user object references for graphs
+ */
+enum __device_builtin__ cudaUserObjectRetainFlags {
+    cudaGraphUserObjectMove = 0x1  /**< Transfer references from the caller rather than creating new references. */
+};
+
+/**
+ * CUDA graphics interop resource
+ */
+struct cudaGraphicsResource;
+
+/**
+ * CUDA graphics interop register flags
+ */
+enum __device_builtin__ cudaGraphicsRegisterFlags
+{
+    cudaGraphicsRegisterFlagsNone             = 0,  /**< Default */
+    cudaGraphicsRegisterFlagsReadOnly         = 1,  /**< CUDA will not write to this resource */ 
+    cudaGraphicsRegisterFlagsWriteDiscard     = 2,  /**< CUDA will only write to and will not read from this resource */
+    cudaGraphicsRegisterFlagsSurfaceLoadStore = 4,  /**< CUDA will bind this resource to a surface reference */
+    cudaGraphicsRegisterFlagsTextureGather    = 8   /**< CUDA will perform texture gather operations on this resource */
+};
+
+/**
+ * CUDA graphics interop map flags
+ */
+enum __device_builtin__ cudaGraphicsMapFlags
+{
+    cudaGraphicsMapFlagsNone         = 0,  /**< Default; Assume resource can be read/written */
+    cudaGraphicsMapFlagsReadOnly     = 1,  /**< CUDA will not write to this resource */
+    cudaGraphicsMapFlagsWriteDiscard = 2   /**< CUDA will only write to and will not read from this resource */
+};
+
+/**
+ * CUDA graphics interop array indices for cube maps
+ */
+enum __device_builtin__ cudaGraphicsCubeFace 
+{
+    cudaGraphicsCubeFacePositiveX = 0x00, /**< Positive X face of cubemap */
+    cudaGraphicsCubeFaceNegativeX = 0x01, /**< Negative X face of cubemap */
+    cudaGraphicsCubeFacePositiveY = 0x02, /**< Positive Y face of cubemap */
+    cudaGraphicsCubeFaceNegativeY = 0x03, /**< Negative Y face of cubemap */
+    cudaGraphicsCubeFacePositiveZ = 0x04, /**< Positive Z face of cubemap */
+    cudaGraphicsCubeFaceNegativeZ = 0x05  /**< Negative Z face of cubemap */
+};
+
+/**
+ * CUDA resource types
+ */
+enum __device_builtin__ cudaResourceType
+{
+    cudaResourceTypeArray          = 0x00, /**< Array resource */
+    cudaResourceTypeMipmappedArray = 0x01, /**< Mipmapped array resource */
+    cudaResourceTypeLinear         = 0x02, /**< Linear resource */
+    cudaResourceTypePitch2D        = 0x03  /**< Pitch 2D resource */
+};
+
+/**
+ * CUDA texture resource view formats
+ */
+enum __device_builtin__ cudaResourceViewFormat
+{
+    cudaResViewFormatNone                      = 0x00, /**< No resource view format (use underlying resource format) */
+    cudaResViewFormatUnsignedChar1             = 0x01, /**< 1 channel unsigned 8-bit integers */
+    cudaResViewFormatUnsignedChar2             = 0x02, /**< 2 channel unsigned 8-bit integers */
+    cudaResViewFormatUnsignedChar4             = 0x03, /**< 4 channel unsigned 8-bit integers */
+    cudaResViewFormatSignedChar1               = 0x04, /**< 1 channel signed 8-bit integers */
+    cudaResViewFormatSignedChar2               = 0x05, /**< 2 channel signed 8-bit integers */
+    cudaResViewFormatSignedChar4               = 0x06, /**< 4 channel signed 8-bit integers */
+    cudaResViewFormatUnsignedShort1            = 0x07, /**< 1 channel unsigned 16-bit integers */
+    cudaResViewFormatUnsignedShort2            = 0x08, /**< 2 channel unsigned 16-bit integers */
+    cudaResViewFormatUnsignedShort4            = 0x09, /**< 4 channel unsigned 16-bit integers */
+    cudaResViewFormatSignedShort1              = 0x0a, /**< 1 channel signed 16-bit integers */
+    cudaResViewFormatSignedShort2              = 0x0b, /**< 2 channel signed 16-bit integers */
+    cudaResViewFormatSignedShort4              = 0x0c, /**< 4 channel signed 16-bit integers */
+    cudaResViewFormatUnsignedInt1              = 0x0d, /**< 1 channel unsigned 32-bit integers */
+    cudaResViewFormatUnsignedInt2              = 0x0e, /**< 2 channel unsigned 32-bit integers */
+    cudaResViewFormatUnsignedInt4              = 0x0f, /**< 4 channel unsigned 32-bit integers */
+    cudaResViewFormatSignedInt1                = 0x10, /**< 1 channel signed 32-bit integers */
+    cudaResViewFormatSignedInt2                = 0x11, /**< 2 channel signed 32-bit integers */
+    cudaResViewFormatSignedInt4                = 0x12, /**< 4 channel signed 32-bit integers */
+    cudaResViewFormatHalf1                     = 0x13, /**< 1 channel 16-bit floating point */
+    cudaResViewFormatHalf2                     = 0x14, /**< 2 channel 16-bit floating point */
+    cudaResViewFormatHalf4                     = 0x15, /**< 4 channel 16-bit floating point */
+    cudaResViewFormatFloat1                    = 0x16, /**< 1 channel 32-bit floating point */
+    cudaResViewFormatFloat2                    = 0x17, /**< 2 channel 32-bit floating point */
+    cudaResViewFormatFloat4                    = 0x18, /**< 4 channel 32-bit floating point */
+    cudaResViewFormatUnsignedBlockCompressed1  = 0x19, /**< Block compressed 1 */
+    cudaResViewFormatUnsignedBlockCompressed2  = 0x1a, /**< Block compressed 2 */
+    cudaResViewFormatUnsignedBlockCompressed3  = 0x1b, /**< Block compressed 3 */
+    cudaResViewFormatUnsignedBlockCompressed4  = 0x1c, /**< Block compressed 4 unsigned */
+    cudaResViewFormatSignedBlockCompressed4    = 0x1d, /**< Block compressed 4 signed */
+    cudaResViewFormatUnsignedBlockCompressed5  = 0x1e, /**< Block compressed 5 unsigned */
+    cudaResViewFormatSignedBlockCompressed5    = 0x1f, /**< Block compressed 5 signed */
+    cudaResViewFormatUnsignedBlockCompressed6H = 0x20, /**< Block compressed 6 unsigned half-float */
+    cudaResViewFormatSignedBlockCompressed6H   = 0x21, /**< Block compressed 6 signed half-float */
+    cudaResViewFormatUnsignedBlockCompressed7  = 0x22  /**< Block compressed 7 */
+};
+
+/**
+ * CUDA resource descriptor
+ */
+struct __device_builtin__ cudaResourceDesc {
+    enum cudaResourceType resType;             /**< Resource type */
+    
+    union {
+        struct {
+            cudaArray_t array;                 /**< CUDA array */
+        } array;
+        struct {
+            cudaMipmappedArray_t mipmap;       /**< CUDA mipmapped array */
+        } mipmap;
+        struct {
+            void *devPtr;                      /**< Device pointer */
+            struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+            size_t sizeInBytes;                /**< Size in bytes */
+        } linear;
+        struct {
+            void *devPtr;                      /**< Device pointer */
+            struct cudaChannelFormatDesc desc; /**< Channel descriptor */
+            size_t width;                      /**< Width of the array in elements */
+            size_t height;                     /**< Height of the array in elements */
+            size_t pitchInBytes;               /**< Pitch between two rows in bytes */
+        } pitch2D;
+    } res;
+};
+
+/**
+ * CUDA resource view descriptor
+ */
+struct __device_builtin__ cudaResourceViewDesc
+{
+    enum cudaResourceViewFormat format;           /**< Resource view format */
+    size_t                      width;            /**< Width of the resource view */
+    size_t                      height;           /**< Height of the resource view */
+    size_t                      depth;            /**< Depth of the resource view */
+    unsigned int                firstMipmapLevel; /**< First defined mipmap level */
+    unsigned int                lastMipmapLevel;  /**< Last defined mipmap level */
+    unsigned int                firstLayer;       /**< First layer index */
+    unsigned int                lastLayer;        /**< Last layer index */
+};
+
+/**
+ * CUDA pointer attributes
+ */
+struct __device_builtin__ cudaPointerAttributes
+{
+    /**
+     * The type of memory - ::cudaMemoryTypeUnregistered, ::cudaMemoryTypeHost,
+     * ::cudaMemoryTypeDevice or ::cudaMemoryTypeManaged.
+     */
+    enum cudaMemoryType type;
+
+    /** 
+     * The device against which the memory was allocated or registered.
+     * If the memory type is ::cudaMemoryTypeDevice then this identifies 
+     * the device on which the memory referred physically resides.  If
+     * the memory type is ::cudaMemoryTypeHost or::cudaMemoryTypeManaged then
+     * this identifies the device which was current when the memory was allocated
+     * or registered (and if that device is deinitialized then this allocation
+     * will vanish with that device's state).
+     */
+    int device;
+
+    /**
+     * The address which may be dereferenced on the current device to access 
+     * the memory or NULL if no such address exists.
+     */
+    void *devicePointer;
+
+    /**
+     * The address which may be dereferenced on the host to access the
+     * memory or NULL if no such address exists.
+     *
+     * \note CUDA doesn't check if unregistered memory is allocated so this field
+     * may contain invalid pointer if an invalid pointer has been passed to CUDA.
+     */
+    void *hostPointer;
+};
+
+/**
+ * CUDA function attributes
+ */
+struct __device_builtin__ cudaFuncAttributes
+{
+   /**
+    * The size in bytes of statically-allocated shared memory per block
+    * required by this function. This does not include dynamically-allocated
+    * shared memory requested by the user at runtime.
+    */
+   size_t sharedSizeBytes;
+
+   /**
+    * The size in bytes of user-allocated constant memory required by this
+    * function.
+    */
+   size_t constSizeBytes;
+
+   /**
+    * The size in bytes of local memory used by each thread of this function.
+    */
+   size_t localSizeBytes;
+
+   /**
+    * The maximum number of threads per block, beyond which a launch of the
+    * function would fail. This number depends on both the function and the
+    * device on which the function is currently loaded.
+    */
+   int maxThreadsPerBlock;
+
+   /**
+    * The number of registers used by each thread of this function.
+    */
+   int numRegs;
+
+   /**
+    * The PTX virtual architecture version for which the function was
+    * compiled. This value is the major PTX version * 10 + the minor PTX
+    * version, so a PTX version 1.3 function would return the value 13.
+    */
+   int ptxVersion;
+
+   /**
+    * The binary architecture version for which the function was compiled.
+    * This value is the major binary version * 10 + the minor binary version,
+    * so a binary version 1.3 function would return the value 13.
+    */
+   int binaryVersion;
+
+   /**
+    * The attribute to indicate whether the function has been compiled with 
+    * user specified option "-Xptxas --dlcm=ca" set.
+    */
+   int cacheModeCA;
+
+   /**
+    * The maximum size in bytes of dynamic shared memory per block for 
+    * this function. Any launch must have a dynamic shared memory size
+    * smaller than this value.
+    */
+   int maxDynamicSharedSizeBytes;
+
+   /**
+    * On devices where the L1 cache and shared memory use the same hardware resources, 
+    * this sets the shared memory carveout preference, in percent of the maximum shared memory. 
+    * Refer to ::cudaDevAttrMaxSharedMemoryPerMultiprocessor.
+    * This is only a hint, and the driver can choose a different ratio if required to execute the function.
+    * See ::cudaFuncSetAttribute
+    */
+   int preferredShmemCarveout;
+};
+
+/**
+ * CUDA function attributes that can be set using ::cudaFuncSetAttribute
+ */
+enum __device_builtin__ cudaFuncAttribute
+{
+    cudaFuncAttributeMaxDynamicSharedMemorySize = 8, /**< Maximum dynamic shared memory size */
+    cudaFuncAttributePreferredSharedMemoryCarveout = 9, /**< Preferred shared memory-L1 cache split */
+    cudaFuncAttributeClusterDimMustBeSet = 10, /**< Indicator to enforce valid cluster dimension specification on kernel launch */
+    cudaFuncAttributeRequiredClusterWidth = 11, /**< Required cluster width */
+    cudaFuncAttributeRequiredClusterHeight = 12, /**< Required cluster height */
+    cudaFuncAttributeRequiredClusterDepth = 13, /**< Required cluster depth */
+    cudaFuncAttributeNonPortableClusterSizeAllowed = 14, /**< Whether non-portable cluster scheduling policy is supported */
+    cudaFuncAttributeClusterSchedulingPolicyPreference = 15, /**< Required cluster scheduling policy preference */
+    cudaFuncAttributeMax
+};
+
+/**
+ * CUDA function cache configurations
+ */
+enum __device_builtin__ cudaFuncCache
+{
+    cudaFuncCachePreferNone   = 0,    /**< Default function cache configuration, no preference */
+    cudaFuncCachePreferShared = 1,    /**< Prefer larger shared memory and smaller L1 cache  */
+    cudaFuncCachePreferL1     = 2,    /**< Prefer larger L1 cache and smaller shared memory */
+    cudaFuncCachePreferEqual  = 3     /**< Prefer equal size L1 cache and shared memory */
+};
+
+/**
+ * CUDA shared memory configuration
+ */
+
+enum __device_builtin__ cudaSharedMemConfig
+{
+    cudaSharedMemBankSizeDefault   = 0,
+    cudaSharedMemBankSizeFourByte  = 1,
+    cudaSharedMemBankSizeEightByte = 2
+};
+
+/** 
+ * Shared memory carveout configurations. These may be passed to cudaFuncSetAttribute
+ */
+enum __device_builtin__ cudaSharedCarveout {
+    cudaSharedmemCarveoutDefault      = -1,  /**< No preference for shared memory or L1 (default) */
+    cudaSharedmemCarveoutMaxShared    = 100, /**< Prefer maximum available shared memory, minimum L1 cache */
+    cudaSharedmemCarveoutMaxL1        = 0    /**< Prefer maximum available L1 cache, minimum shared memory */
+};
+
+/**
+ * CUDA device compute modes
+ */
+enum __device_builtin__ cudaComputeMode
+{
+    cudaComputeModeDefault          = 0,  /**< Default compute mode (Multiple threads can use ::cudaSetDevice() with this device) */
+    cudaComputeModeExclusive        = 1,  /**< Compute-exclusive-thread mode (Only one thread in one process will be able to use ::cudaSetDevice() with this device) */
+    cudaComputeModeProhibited       = 2,  /**< Compute-prohibited mode (No threads can use ::cudaSetDevice() with this device) */
+    cudaComputeModeExclusiveProcess = 3   /**< Compute-exclusive-process mode (Many threads in one process will be able to use ::cudaSetDevice() with this device) */
+};
+
+/**
+ * CUDA Limits
+ */
+enum __device_builtin__ cudaLimit
+{
+    cudaLimitStackSize                    = 0x00, /**< GPU thread stack size */
+    cudaLimitPrintfFifoSize               = 0x01, /**< GPU printf FIFO size */
+    cudaLimitMallocHeapSize               = 0x02, /**< GPU malloc heap size */
+    cudaLimitDevRuntimeSyncDepth          = 0x03, /**< GPU device runtime synchronize depth */
+    cudaLimitDevRuntimePendingLaunchCount = 0x04, /**< GPU device runtime pending launch count */
+    cudaLimitMaxL2FetchGranularity        = 0x05, /**< A value between 0 and 128 that indicates the maximum fetch granularity of L2 (in Bytes). This is a hint */
+    cudaLimitPersistingL2CacheSize        = 0x06  /**< A size in bytes for L2 persisting lines cache size */
+};
+
+/**
+ * CUDA Memory Advise values
+ */
+enum __device_builtin__ cudaMemoryAdvise
+{
+    cudaMemAdviseSetReadMostly          = 1, /**< Data will mostly be read and only occassionally be written to */
+    cudaMemAdviseUnsetReadMostly        = 2, /**< Undo the effect of ::cudaMemAdviseSetReadMostly */
+    cudaMemAdviseSetPreferredLocation   = 3, /**< Set the preferred location for the data as the specified device */
+    cudaMemAdviseUnsetPreferredLocation = 4, /**< Clear the preferred location for the data */
+    cudaMemAdviseSetAccessedBy          = 5, /**< Data will be accessed by the specified device, so prevent page faults as much as possible */
+    cudaMemAdviseUnsetAccessedBy        = 6  /**< Let the Unified Memory subsystem decide on the page faulting policy for the specified device */
+};
+
+/**
+ * CUDA range attributes
+ */
+enum __device_builtin__ cudaMemRangeAttribute
+{
+    cudaMemRangeAttributeReadMostly           = 1, /**< Whether the range will mostly be read and only occassionally be written to */
+    cudaMemRangeAttributePreferredLocation    = 2, /**< The preferred location of the range */
+    cudaMemRangeAttributeAccessedBy           = 3, /**< Memory range has ::cudaMemAdviseSetAccessedBy set for specified device */
+    cudaMemRangeAttributeLastPrefetchLocation = 4  /**< The last location to which the range was prefetched */
+};
+
+/**
+ * CUDA Profiler Output modes
+ */
+enum __device_builtin__ cudaOutputMode
+{
+    cudaKeyValuePair    = 0x00, /**< Output mode Key-Value pair format. */
+    cudaCSV             = 0x01  /**< Output mode Comma separated values format. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes APIs supported on the device
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesOptions {
+    cudaFlushGPUDirectRDMAWritesOptionHost   = 1<<0, /**< ::cudaDeviceFlushGPUDirectRDMAWrites() and its CUDA Driver API counterpart are supported on the device. */
+    cudaFlushGPUDirectRDMAWritesOptionMemOps = 1<<1  /**< The ::CU_STREAM_WAIT_VALUE_FLUSH flag and the ::CU_STREAM_MEM_OP_FLUSH_REMOTE_WRITES MemOp are supported on the CUDA device. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes ordering features of the device
+ */
+enum __device_builtin__ cudaGPUDirectRDMAWritesOrdering {
+    cudaGPUDirectRDMAWritesOrderingNone       = 0,   /**< The device does not natively support ordering of GPUDirect RDMA writes. ::cudaFlushGPUDirectRDMAWrites() can be leveraged if supported. */
+    cudaGPUDirectRDMAWritesOrderingOwner      = 100, /**< Natively, the device can consistently consume GPUDirect RDMA writes, although other CUDA devices may not. */
+    cudaGPUDirectRDMAWritesOrderingAllDevices = 200  /**< Any CUDA device in the system can consistently consume GPUDirect RDMA writes to this device. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes scopes
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesScope {
+    cudaFlushGPUDirectRDMAWritesToOwner      = 100, /**< Blocks until remote writes are visible to the CUDA device context owning the data. */
+    cudaFlushGPUDirectRDMAWritesToAllDevices = 200  /**< Blocks until remote writes are visible to all CUDA device contexts. */
+};
+
+/**
+ * CUDA GPUDirect RDMA flush writes targets
+ */
+enum __device_builtin__ cudaFlushGPUDirectRDMAWritesTarget {
+    cudaFlushGPUDirectRDMAWritesTargetCurrentDevice /**< Sets the target for ::cudaDeviceFlushGPUDirectRDMAWrites() to the currently active CUDA device context. */
+};
+
+
+/**
+ * CUDA device attributes
+ */
+enum __device_builtin__ cudaDeviceAttr
+{
+    cudaDevAttrMaxThreadsPerBlock             = 1,  /**< Maximum number of threads per block */
+    cudaDevAttrMaxBlockDimX                   = 2,  /**< Maximum block dimension X */
+    cudaDevAttrMaxBlockDimY                   = 3,  /**< Maximum block dimension Y */
+    cudaDevAttrMaxBlockDimZ                   = 4,  /**< Maximum block dimension Z */
+    cudaDevAttrMaxGridDimX                    = 5,  /**< Maximum grid dimension X */
+    cudaDevAttrMaxGridDimY                    = 6,  /**< Maximum grid dimension Y */
+    cudaDevAttrMaxGridDimZ                    = 7,  /**< Maximum grid dimension Z */
+    cudaDevAttrMaxSharedMemoryPerBlock        = 8,  /**< Maximum shared memory available per block in bytes */
+    cudaDevAttrTotalConstantMemory            = 9,  /**< Memory available on device for __constant__ variables in a CUDA C kernel in bytes */
+    cudaDevAttrWarpSize                       = 10, /**< Warp size in threads */
+    cudaDevAttrMaxPitch                       = 11, /**< Maximum pitch in bytes allowed by memory copies */
+    cudaDevAttrMaxRegistersPerBlock           = 12, /**< Maximum number of 32-bit registers available per block */
+    cudaDevAttrClockRate                      = 13, /**< Peak clock frequency in kilohertz */
+    cudaDevAttrTextureAlignment               = 14, /**< Alignment requirement for textures */
+    cudaDevAttrGpuOverlap                     = 15, /**< Device can possibly copy memory and execute a kernel concurrently */
+    cudaDevAttrMultiProcessorCount            = 16, /**< Number of multiprocessors on device */
+    cudaDevAttrKernelExecTimeout              = 17, /**< Specifies whether there is a run time limit on kernels */
+    cudaDevAttrIntegrated                     = 18, /**< Device is integrated with host memory */
+    cudaDevAttrCanMapHostMemory               = 19, /**< Device can map host memory into CUDA address space */
+    cudaDevAttrComputeMode                    = 20, /**< Compute mode (See ::cudaComputeMode for details) */
+    cudaDevAttrMaxTexture1DWidth              = 21, /**< Maximum 1D texture width */
+    cudaDevAttrMaxTexture2DWidth              = 22, /**< Maximum 2D texture width */
+    cudaDevAttrMaxTexture2DHeight             = 23, /**< Maximum 2D texture height */
+    cudaDevAttrMaxTexture3DWidth              = 24, /**< Maximum 3D texture width */
+    cudaDevAttrMaxTexture3DHeight             = 25, /**< Maximum 3D texture height */
+    cudaDevAttrMaxTexture3DDepth              = 26, /**< Maximum 3D texture depth */
+    cudaDevAttrMaxTexture2DLayeredWidth       = 27, /**< Maximum 2D layered texture width */
+    cudaDevAttrMaxTexture2DLayeredHeight      = 28, /**< Maximum 2D layered texture height */
+    cudaDevAttrMaxTexture2DLayeredLayers      = 29, /**< Maximum layers in a 2D layered texture */
+    cudaDevAttrSurfaceAlignment               = 30, /**< Alignment requirement for surfaces */
+    cudaDevAttrConcurrentKernels              = 31, /**< Device can possibly execute multiple kernels concurrently */
+    cudaDevAttrEccEnabled                     = 32, /**< Device has ECC support enabled */
+    cudaDevAttrPciBusId                       = 33, /**< PCI bus ID of the device */
+    cudaDevAttrPciDeviceId                    = 34, /**< PCI device ID of the device */
+    cudaDevAttrTccDriver                      = 35, /**< Device is using TCC driver model */
+    cudaDevAttrMemoryClockRate                = 36, /**< Peak memory clock frequency in kilohertz */
+    cudaDevAttrGlobalMemoryBusWidth           = 37, /**< Global memory bus width in bits */
+    cudaDevAttrL2CacheSize                    = 38, /**< Size of L2 cache in bytes */
+    cudaDevAttrMaxThreadsPerMultiProcessor    = 39, /**< Maximum resident threads per multiprocessor */
+    cudaDevAttrAsyncEngineCount               = 40, /**< Number of asynchronous engines */
+    cudaDevAttrUnifiedAddressing              = 41, /**< Device shares a unified address space with the host */    
+    cudaDevAttrMaxTexture1DLayeredWidth       = 42, /**< Maximum 1D layered texture width */
+    cudaDevAttrMaxTexture1DLayeredLayers      = 43, /**< Maximum layers in a 1D layered texture */
+    cudaDevAttrMaxTexture2DGatherWidth        = 45, /**< Maximum 2D texture width if cudaArrayTextureGather is set */
+    cudaDevAttrMaxTexture2DGatherHeight       = 46, /**< Maximum 2D texture height if cudaArrayTextureGather is set */
+    cudaDevAttrMaxTexture3DWidthAlt           = 47, /**< Alternate maximum 3D texture width */
+    cudaDevAttrMaxTexture3DHeightAlt          = 48, /**< Alternate maximum 3D texture height */
+    cudaDevAttrMaxTexture3DDepthAlt           = 49, /**< Alternate maximum 3D texture depth */
+    cudaDevAttrPciDomainId                    = 50, /**< PCI domain ID of the device */
+    cudaDevAttrTexturePitchAlignment          = 51, /**< Pitch alignment requirement for textures */
+    cudaDevAttrMaxTextureCubemapWidth         = 52, /**< Maximum cubemap texture width/height */
+    cudaDevAttrMaxTextureCubemapLayeredWidth  = 53, /**< Maximum cubemap layered texture width/height */
+    cudaDevAttrMaxTextureCubemapLayeredLayers = 54, /**< Maximum layers in a cubemap layered texture */
+    cudaDevAttrMaxSurface1DWidth              = 55, /**< Maximum 1D surface width */
+    cudaDevAttrMaxSurface2DWidth              = 56, /**< Maximum 2D surface width */
+    cudaDevAttrMaxSurface2DHeight             = 57, /**< Maximum 2D surface height */
+    cudaDevAttrMaxSurface3DWidth              = 58, /**< Maximum 3D surface width */
+    cudaDevAttrMaxSurface3DHeight             = 59, /**< Maximum 3D surface height */
+    cudaDevAttrMaxSurface3DDepth              = 60, /**< Maximum 3D surface depth */
+    cudaDevAttrMaxSurface1DLayeredWidth       = 61, /**< Maximum 1D layered surface width */
+    cudaDevAttrMaxSurface1DLayeredLayers      = 62, /**< Maximum layers in a 1D layered surface */
+    cudaDevAttrMaxSurface2DLayeredWidth       = 63, /**< Maximum 2D layered surface width */
+    cudaDevAttrMaxSurface2DLayeredHeight      = 64, /**< Maximum 2D layered surface height */
+    cudaDevAttrMaxSurface2DLayeredLayers      = 65, /**< Maximum layers in a 2D layered surface */
+    cudaDevAttrMaxSurfaceCubemapWidth         = 66, /**< Maximum cubemap surface width */
+    cudaDevAttrMaxSurfaceCubemapLayeredWidth  = 67, /**< Maximum cubemap layered surface width */
+    cudaDevAttrMaxSurfaceCubemapLayeredLayers = 68, /**< Maximum layers in a cubemap layered surface */
+    cudaDevAttrMaxTexture1DLinearWidth        = 69, /**< Maximum 1D linear texture width */
+    cudaDevAttrMaxTexture2DLinearWidth        = 70, /**< Maximum 2D linear texture width */
+    cudaDevAttrMaxTexture2DLinearHeight       = 71, /**< Maximum 2D linear texture height */
+    cudaDevAttrMaxTexture2DLinearPitch        = 72, /**< Maximum 2D linear texture pitch in bytes */
+    cudaDevAttrMaxTexture2DMipmappedWidth     = 73, /**< Maximum mipmapped 2D texture width */
+    cudaDevAttrMaxTexture2DMipmappedHeight    = 74, /**< Maximum mipmapped 2D texture height */
+    cudaDevAttrComputeCapabilityMajor         = 75, /**< Major compute capability version number */ 
+    cudaDevAttrComputeCapabilityMinor         = 76, /**< Minor compute capability version number */
+    cudaDevAttrMaxTexture1DMipmappedWidth     = 77, /**< Maximum mipmapped 1D texture width */
+    cudaDevAttrStreamPrioritiesSupported      = 78, /**< Device supports stream priorities */
+    cudaDevAttrGlobalL1CacheSupported         = 79, /**< Device supports caching globals in L1 */
+    cudaDevAttrLocalL1CacheSupported          = 80, /**< Device supports caching locals in L1 */
+    cudaDevAttrMaxSharedMemoryPerMultiprocessor = 81, /**< Maximum shared memory available per multiprocessor in bytes */
+    cudaDevAttrMaxRegistersPerMultiprocessor  = 82, /**< Maximum number of 32-bit registers available per multiprocessor */
+    cudaDevAttrManagedMemory                  = 83, /**< Device can allocate managed memory on this system */
+    cudaDevAttrIsMultiGpuBoard                = 84, /**< Device is on a multi-GPU board */
+    cudaDevAttrMultiGpuBoardGroupID           = 85, /**< Unique identifier for a group of devices on the same multi-GPU board */
+    cudaDevAttrHostNativeAtomicSupported      = 86, /**< Link between the device and the host supports native atomic operations */
+    cudaDevAttrSingleToDoublePrecisionPerfRatio = 87, /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    cudaDevAttrPageableMemoryAccess           = 88, /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
+    cudaDevAttrConcurrentManagedAccess        = 89, /**< Device can coherently access managed memory concurrently with the CPU */
+    cudaDevAttrComputePreemptionSupported     = 90, /**< Device supports Compute Preemption */
+    cudaDevAttrCanUseHostPointerForRegisteredMem = 91, /**< Device can access host registered memory at the same virtual address as the CPU */
+    cudaDevAttrReserved92                     = 92,
+    cudaDevAttrReserved93                     = 93,
+    cudaDevAttrReserved94                     = 94,
+    cudaDevAttrCooperativeLaunch              = 95, /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel*/
+    cudaDevAttrCooperativeMultiDeviceLaunch   = 96, /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
+    cudaDevAttrMaxSharedMemoryPerBlockOptin   = 97, /**< The maximum optin shared memory per block. This value may vary by chip. See ::cudaFuncSetAttribute */
+    cudaDevAttrCanFlushRemoteWrites           = 98, /**< Device supports flushing of outstanding remote writes. */
+    cudaDevAttrHostRegisterSupported          = 99, /**< Device supports host memory registration via ::cudaHostRegister. */
+    cudaDevAttrPageableMemoryAccessUsesHostPageTables = 100, /**< Device accesses pageable memory via the host's page tables. */
+    cudaDevAttrDirectManagedMemAccessFromHost = 101, /**< Host can directly access managed memory on the device without migration. */
+    cudaDevAttrMaxBlocksPerMultiprocessor     = 106, /**< Maximum number of blocks per multiprocessor */
+    cudaDevAttrMaxPersistingL2CacheSize       = 108, /**< Maximum L2 persisting lines capacity setting in bytes. */
+    cudaDevAttrMaxAccessPolicyWindowSize      = 109, /**< Maximum value of cudaAccessPolicyWindow::num_bytes. */
+    cudaDevAttrReservedSharedMemoryPerBlock   = 111, /**< Shared memory reserved by CUDA driver per block in bytes */
+    cudaDevAttrSparseCudaArraySupported       = 112, /**< Device supports sparse CUDA arrays and sparse CUDA mipmapped arrays */
+    cudaDevAttrHostRegisterReadOnlySupported  = 113,  /**< Device supports using the ::cudaHostRegister flag cudaHostRegisterReadOnly to register memory that must be mapped as read-only to the GPU */
+    cudaDevAttrTimelineSemaphoreInteropSupported = 114,  /**< External timeline semaphore interop is supported on the device */
+    cudaDevAttrMaxTimelineSemaphoreInteropSupported = 114,  /**< Deprecated, External timeline semaphore interop is supported on the device */
+    cudaDevAttrMemoryPoolsSupported           = 115, /**< Device supports using the ::cudaMallocAsync and ::cudaMemPool family of APIs */
+    cudaDevAttrGPUDirectRDMASupported         = 116, /**< Device supports GPUDirect RDMA APIs, like nvidia_p2p_get_pages (see https://docs.nvidia.com/cuda/gpudirect-rdma for more information) */
+    cudaDevAttrGPUDirectRDMAFlushWritesOptions = 117, /**< The returned attribute shall be interpreted as a bitmask, where the individual bits are listed in the ::cudaFlushGPUDirectRDMAWritesOptions enum */
+    cudaDevAttrGPUDirectRDMAWritesOrdering    = 118, /**< GPUDirect RDMA writes to the device do not need to be flushed for consumers within the scope indicated by the returned attribute. See ::cudaGPUDirectRDMAWritesOrdering for the numerical values returned here. */
+    cudaDevAttrMemoryPoolSupportedHandleTypes = 119, /**< Handle types supported with mempool based IPC */
+    cudaDevAttrClusterLaunch                  = 120, /**< Indicates device supports cluster launch */
+    cudaDevAttrDeferredMappingCudaArraySupported = 121, /**< Device supports deferred mapping CUDA arrays and CUDA mipmapped arrays */
+    cudaDevAttrMax
+};
+
+/**
+ * CUDA memory pool attributes
+ */
+enum __device_builtin__ cudaMemPoolAttr
+{
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to use memory asynchronously freed
+     * in another streams as long as a stream ordering dependency
+     * of the allocating stream on the free action exists.
+     * Cuda events and null stream interactions can create the required
+     * stream ordered dependencies. (default enabled)
+     */
+    cudaMemPoolReuseFollowEventDependencies   = 0x1,
+
+    /**
+     * (value type = int)
+     * Allow reuse of already completed frees when there is no dependency
+     * between the free and allocation. (default enabled)
+     */
+    cudaMemPoolReuseAllowOpportunistic        = 0x2,
+
+    /**
+     * (value type = int)
+     * Allow cuMemAllocAsync to insert new stream dependencies
+     * in order to establish the stream ordering required to reuse
+     * a piece of memory released by cuFreeAsync (default enabled).
+     */
+    cudaMemPoolReuseAllowInternalDependencies = 0x3,
+
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of reserved memory in bytes to hold onto before trying
+     * to release memory back to the OS. When more than the release
+     * threshold bytes of memory are held by the memory pool, the
+     * allocator will try to release memory back to the OS on the
+     * next call to stream, event or context synchronize. (default 0)
+     */
+    cudaMemPoolAttrReleaseThreshold           = 0x4,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of backing memory currently allocated for the mempool.
+     */
+    cudaMemPoolAttrReservedMemCurrent         = 0x5,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of backing memory allocated for the mempool since the
+     * last time it was reset. High watermark can only be reset to zero.
+     */
+    cudaMemPoolAttrReservedMemHigh            = 0x6,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory from the pool that is currently in use by the application.
+     */
+    cudaMemPoolAttrUsedMemCurrent             = 0x7,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of the amount of memory from the pool that was in use by the application since
+     * the last time it was reset. High watermark can only be reset to zero.
+     */
+    cudaMemPoolAttrUsedMemHigh                = 0x8
+};
+
+/**
+ * Specifies the type of location 
+ */
+enum __device_builtin__ cudaMemLocationType {
+    cudaMemLocationTypeInvalid = 0,
+    cudaMemLocationTypeDevice = 1  /**< Location is a device location, thus id is a device ordinal */
+};
+
+/**
+ * Specifies a memory location.
+ *
+ * To specify a gpu, set type = ::cudaMemLocationTypeDevice and set id = the gpu's device ordinal.
+ */
+struct __device_builtin__ cudaMemLocation {
+    enum cudaMemLocationType type;  /**< Specifies the location type, which modifies the meaning of id. */
+    int id;                         /**< identifier for a given this location's ::CUmemLocationType. */
+};
+
+/**
+ * Specifies the memory protection flags for mapping.
+ */
+enum __device_builtin__ cudaMemAccessFlags {
+    cudaMemAccessFlagsProtNone      = 0,  /**< Default, make the address range not accessible */
+    cudaMemAccessFlagsProtRead      = 1,  /**< Make the address range read accessible */
+    cudaMemAccessFlagsProtReadWrite = 3   /**< Make the address range read-write accessible */
+};
+
+/**
+ * Memory access descriptor
+ */
+struct __device_builtin__ cudaMemAccessDesc {
+    struct cudaMemLocation  location; /**< Location on which the request is to change it's accessibility */
+    enum cudaMemAccessFlags flags;    /**< ::CUmemProt accessibility flags to set on the request */
+};
+
+/**
+ * Defines the allocation types available
+ */
+enum __device_builtin__ cudaMemAllocationType {
+    cudaMemAllocationTypeInvalid = 0x0,
+    /** This allocation type is 'pinned', i.e. cannot migrate from its current
+      * location while the application is actively using it
+      */
+    cudaMemAllocationTypePinned  = 0x1,
+    cudaMemAllocationTypeMax     = 0x7FFFFFFF 
+};
+
+/**
+ * Flags for specifying particular handle types
+ */
+enum __device_builtin__ cudaMemAllocationHandleType {
+    cudaMemHandleTypeNone                    = 0x0,  /**< Does not allow any export mechanism. > */
+    cudaMemHandleTypePosixFileDescriptor     = 0x1,  /**< Allows a file descriptor to be used for exporting. Permitted only on POSIX systems. (int) */
+    cudaMemHandleTypeWin32                   = 0x2,  /**< Allows a Win32 NT handle to be used for exporting. (HANDLE) */
+    cudaMemHandleTypeWin32Kmt                = 0x4   /**< Allows a Win32 KMT handle to be used for exporting. (D3DKMT_HANDLE) */
+};
+
+/**
+ * Specifies the properties of allocations made from the pool.
+ */
+struct __device_builtin__ cudaMemPoolProps {
+    enum cudaMemAllocationType         allocType;   /**< Allocation type. Currently must be specified as cudaMemAllocationTypePinned */
+    enum cudaMemAllocationHandleType   handleTypes; /**< Handle types that will be supported by allocations from the pool. */
+    struct cudaMemLocation             location;    /**< Location allocations should reside. */
+    /**
+     * Windows-specific LPSECURITYATTRIBUTES required when
+     * ::cudaMemHandleTypeWin32 is specified.  This security attribute defines
+     * the scope of which exported allocations may be tranferred to other
+     * processes.  In all other cases, this field is required to be zero.
+     */
+    void                              *win32SecurityAttributes;
+    unsigned char                      reserved[64]; /**< reserved for future use, must be 0 */
+};
+
+/**
+ * Opaque data for exporting a pool allocation
+ */
+struct __device_builtin__ cudaMemPoolPtrExportData {
+    unsigned char reserved[64];
+};
+
+/**
+ * Memory allocation node parameters
+ */
+struct __device_builtin__ cudaMemAllocNodeParams {
+    /**
+    * in: location where the allocation should reside (specified in ::location).
+    * ::handleTypes must be ::cudaMemHandleTypeNone. IPC is not supported.
+    */
+    struct cudaMemPoolProps         poolProps;       /**< in: array of memory access descriptors. Used to describe peer GPU access */
+    const struct cudaMemAccessDesc *accessDescs;     /**< in: number of memory access descriptors.  Must not exceed the number of GPUs. */
+    size_t                          accessDescCount; /**< in: Number of `accessDescs`s */
+    size_t                          bytesize;        /**< in: size in bytes of the requested allocation */
+    void                           *dptr;            /**< out: address of the allocation returned by CUDA */
+};
+
+/**
+ * Graph memory attributes
+ */
+enum __device_builtin__ cudaGraphMemAttributeType {
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently associated with graphs.
+     */
+    cudaGraphMemAttrUsedMemCurrent      = 0x0,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, associated with graphs since the
+     * last time it was reset.  High watermark can only be reset to zero.
+     */
+    cudaGraphMemAttrUsedMemHigh         = 0x1,
+
+    /**
+     * (value type = cuuint64_t)
+     * Amount of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    cudaGraphMemAttrReservedMemCurrent  = 0x2,
+
+    /**
+     * (value type = cuuint64_t)
+     * High watermark of memory, in bytes, currently allocated for use by
+     * the CUDA graphs asynchronous allocator.
+     */
+    cudaGraphMemAttrReservedMemHigh     = 0x3
+};
+
+/**
+ * CUDA device P2P attributes
+ */
+
+enum __device_builtin__ cudaDeviceP2PAttr {
+    cudaDevP2PAttrPerformanceRank              = 1, /**< A relative value indicating the performance of the link between two devices */
+    cudaDevP2PAttrAccessSupported              = 2, /**< Peer access is enabled */
+    cudaDevP2PAttrNativeAtomicSupported        = 3, /**< Native atomic operation over the link supported */
+    cudaDevP2PAttrCudaArrayAccessSupported     = 4  /**< Accessing CUDA arrays over the link supported */
+};
+
+/**
+ * CUDA UUID types
+ */
+#ifndef CU_UUID_HAS_BEEN_DEFINED
+#define CU_UUID_HAS_BEEN_DEFINED
+struct __device_builtin__ CUuuid_st {     /**< CUDA definition of UUID */
+    char bytes[16];
+};
+typedef __device_builtin__ struct CUuuid_st CUuuid;
+#endif
+typedef __device_builtin__ struct CUuuid_st cudaUUID_t;
+
+/**
+ * CUDA device properties
+ */
+struct __device_builtin__ cudaDeviceProp
+{
+    char         name[256];                  /**< ASCII string identifying device */
+    cudaUUID_t   uuid;                       /**< 16-byte unique identifier */
+    char         luid[8];                    /**< 8-byte locally unique identifier. Value is undefined on TCC and non-Windows platforms */
+    unsigned int luidDeviceNodeMask;         /**< LUID device node mask. Value is undefined on TCC and non-Windows platforms */
+    size_t       totalGlobalMem;             /**< Global memory available on device in bytes */
+    size_t       sharedMemPerBlock;          /**< Shared memory available per block in bytes */
+    int          regsPerBlock;               /**< 32-bit registers available per block */
+    int          warpSize;                   /**< Warp size in threads */
+    size_t       memPitch;                   /**< Maximum pitch in bytes allowed by memory copies */
+    int          maxThreadsPerBlock;         /**< Maximum number of threads per block */
+    int          maxThreadsDim[3];           /**< Maximum size of each dimension of a block */
+    int          maxGridSize[3];             /**< Maximum size of each dimension of a grid */
+    int          clockRate;                  /**< Clock frequency in kilohertz */
+    size_t       totalConstMem;              /**< Constant memory available on device in bytes */
+    int          major;                      /**< Major compute capability */
+    int          minor;                      /**< Minor compute capability */
+    size_t       textureAlignment;           /**< Alignment requirement for textures */
+    size_t       texturePitchAlignment;      /**< Pitch alignment requirement for texture references bound to pitched memory */
+    int          deviceOverlap;              /**< Device can concurrently copy memory and execute a kernel. Deprecated. Use instead asyncEngineCount. */
+    int          multiProcessorCount;        /**< Number of multiprocessors on device */
+    int          kernelExecTimeoutEnabled;   /**< Specified whether there is a run time limit on kernels */
+    int          integrated;                 /**< Device is integrated as opposed to discrete */
+    int          canMapHostMemory;           /**< Device can map host memory with cudaHostAlloc/cudaHostGetDevicePointer */
+    int          computeMode;                /**< Compute mode (See ::cudaComputeMode) */
+    int          maxTexture1D;               /**< Maximum 1D texture size */
+    int          maxTexture1DMipmap;         /**< Maximum 1D mipmapped texture size */
+    int          maxTexture1DLinear;         /**< Deprecated, do not use. Use cudaDeviceGetTexture1DLinearMaxWidth() or cuDeviceGetTexture1DLinearMaxWidth() instead. */
+    int          maxTexture2D[2];            /**< Maximum 2D texture dimensions */
+    int          maxTexture2DMipmap[2];      /**< Maximum 2D mipmapped texture dimensions */
+    int          maxTexture2DLinear[3];      /**< Maximum dimensions (width, height, pitch) for 2D textures bound to pitched memory */
+    int          maxTexture2DGather[2];      /**< Maximum 2D texture dimensions if texture gather operations have to be performed */
+    int          maxTexture3D[3];            /**< Maximum 3D texture dimensions */
+    int          maxTexture3DAlt[3];         /**< Maximum alternate 3D texture dimensions */
+    int          maxTextureCubemap;          /**< Maximum Cubemap texture dimensions */
+    int          maxTexture1DLayered[2];     /**< Maximum 1D layered texture dimensions */
+    int          maxTexture2DLayered[3];     /**< Maximum 2D layered texture dimensions */
+    int          maxTextureCubemapLayered[2];/**< Maximum Cubemap layered texture dimensions */
+    int          maxSurface1D;               /**< Maximum 1D surface size */
+    int          maxSurface2D[2];            /**< Maximum 2D surface dimensions */
+    int          maxSurface3D[3];            /**< Maximum 3D surface dimensions */
+    int          maxSurface1DLayered[2];     /**< Maximum 1D layered surface dimensions */
+    int          maxSurface2DLayered[3];     /**< Maximum 2D layered surface dimensions */
+    int          maxSurfaceCubemap;          /**< Maximum Cubemap surface dimensions */
+    int          maxSurfaceCubemapLayered[2];/**< Maximum Cubemap layered surface dimensions */
+    size_t       surfaceAlignment;           /**< Alignment requirements for surfaces */
+    int          concurrentKernels;          /**< Device can possibly execute multiple kernels concurrently */
+    int          ECCEnabled;                 /**< Device has ECC support enabled */
+    int          pciBusID;                   /**< PCI bus ID of the device */
+    int          pciDeviceID;                /**< PCI device ID of the device */
+    int          pciDomainID;                /**< PCI domain ID of the device */
+    int          tccDriver;                  /**< 1 if device is a Tesla device using TCC driver, 0 otherwise */
+    int          asyncEngineCount;           /**< Number of asynchronous engines */
+    int          unifiedAddressing;          /**< Device shares a unified address space with the host */
+    int          memoryClockRate;            /**< Peak memory clock frequency in kilohertz */
+    int          memoryBusWidth;             /**< Global memory bus width in bits */
+    int          l2CacheSize;                /**< Size of L2 cache in bytes */
+    int          persistingL2CacheMaxSize;   /**< Device's maximum l2 persisting lines capacity setting in bytes */
+    int          maxThreadsPerMultiProcessor;/**< Maximum resident threads per multiprocessor */
+    int          streamPrioritiesSupported;  /**< Device supports stream priorities */
+    int          globalL1CacheSupported;     /**< Device supports caching globals in L1 */
+    int          localL1CacheSupported;      /**< Device supports caching locals in L1 */
+    size_t       sharedMemPerMultiprocessor; /**< Shared memory available per multiprocessor in bytes */
+    int          regsPerMultiprocessor;      /**< 32-bit registers available per multiprocessor */
+    int          managedMemory;              /**< Device supports allocating managed memory on this system */
+    int          isMultiGpuBoard;            /**< Device is on a multi-GPU board */
+    int          multiGpuBoardGroupID;       /**< Unique identifier for a group of devices on the same multi-GPU board */
+    int          hostNativeAtomicSupported;  /**< Link between the device and the host supports native atomic operations */
+    int          singleToDoublePrecisionPerfRatio; /**< Ratio of single precision performance (in floating-point operations per second) to double precision performance */
+    int          pageableMemoryAccess;       /**< Device supports coherently accessing pageable memory without calling cudaHostRegister on it */
+    int          concurrentManagedAccess;    /**< Device can coherently access managed memory concurrently with the CPU */
+    int          computePreemptionSupported; /**< Device supports Compute Preemption */
+    int          canUseHostPointerForRegisteredMem; /**< Device can access host registered memory at the same virtual address as the CPU */
+    int          cooperativeLaunch;          /**< Device supports launching cooperative kernels via ::cudaLaunchCooperativeKernel */
+    int          cooperativeMultiDeviceLaunch; /**< Deprecated, cudaLaunchCooperativeKernelMultiDevice is deprecated. */
+    size_t       sharedMemPerBlockOptin;     /**< Per device maximum shared memory per block usable by special opt in */
+    int          pageableMemoryAccessUsesHostPageTables; /**< Device accesses pageable memory via the host's page tables */
+    int          directManagedMemAccessFromHost; /**< Host can directly access managed memory on the device without migration. */
+    int          maxBlocksPerMultiProcessor; /**< Maximum number of resident blocks per multiprocessor */
+    int          accessPolicyMaxWindowSize;  /**< The maximum value of ::cudaAccessPolicyWindow::num_bytes. */
+    size_t       reservedSharedMemPerBlock;  /**< Shared memory reserved by CUDA driver per block in bytes */
+};
+
+#define cudaDevicePropDontCare                                 \
+        {                                                      \
+          {'\0'},    /* char         name[256];               */ \
+          {{0}},     /* cudaUUID_t   uuid;                    */ \
+          {'\0'},    /* char         luid[8];                 */ \
+          0,         /* unsigned int luidDeviceNodeMask       */ \
+          0,         /* size_t       totalGlobalMem;          */ \
+          0,         /* size_t       sharedMemPerBlock;       */ \
+          0,         /* int          regsPerBlock;            */ \
+          0,         /* int          warpSize;                */ \
+          0,         /* size_t       memPitch;                */ \
+          0,         /* int          maxThreadsPerBlock;      */ \
+          {0, 0, 0}, /* int          maxThreadsDim[3];        */ \
+          {0, 0, 0}, /* int          maxGridSize[3];          */ \
+          0,         /* int          clockRate;               */ \
+          0,         /* size_t       totalConstMem;           */ \
+          -1,        /* int          major;                   */ \
+          -1,        /* int          minor;                   */ \
+          0,         /* size_t       textureAlignment;        */ \
+          0,         /* size_t       texturePitchAlignment    */ \
+          -1,        /* int          deviceOverlap;           */ \
+          0,         /* int          multiProcessorCount;     */ \
+          0,         /* int          kernelExecTimeoutEnabled */ \
+          0,         /* int          integrated               */ \
+          0,         /* int          canMapHostMemory         */ \
+          0,         /* int          computeMode              */ \
+          0,         /* int          maxTexture1D             */ \
+          0,         /* int          maxTexture1DMipmap       */ \
+          0,         /* int          maxTexture1DLinear       */ \
+          {0, 0},    /* int          maxTexture2D[2]          */ \
+          {0, 0},    /* int          maxTexture2DMipmap[2]    */ \
+          {0, 0, 0}, /* int          maxTexture2DLinear[3]    */ \
+          {0, 0},    /* int          maxTexture2DGather[2]    */ \
+          {0, 0, 0}, /* int          maxTexture3D[3]          */ \
+          {0, 0, 0}, /* int          maxTexture3DAlt[3]       */ \
+          0,         /* int          maxTextureCubemap        */ \
+          {0, 0},    /* int          maxTexture1DLayered[2]   */ \
+          {0, 0, 0}, /* int          maxTexture2DLayered[3]   */ \
+          {0, 0},    /* int          maxTextureCubemapLayered[2] */ \
+          0,         /* int          maxSurface1D             */ \
+          {0, 0},    /* int          maxSurface2D[2]          */ \
+          {0, 0, 0}, /* int          maxSurface3D[3]          */ \
+          {0, 0},    /* int          maxSurface1DLayered[2]   */ \
+          {0, 0, 0}, /* int          maxSurface2DLayered[3]   */ \
+          0,         /* int          maxSurfaceCubemap        */ \
+          {0, 0},    /* int          maxSurfaceCubemapLayered[2] */ \
+          0,         /* size_t       surfaceAlignment         */ \
+          0,         /* int          concurrentKernels        */ \
+          0,         /* int          ECCEnabled               */ \
+          0,         /* int          pciBusID                 */ \
+          0,         /* int          pciDeviceID              */ \
+          0,         /* int          pciDomainID              */ \
+          0,         /* int          tccDriver                */ \
+          0,         /* int          asyncEngineCount         */ \
+          0,         /* int          unifiedAddressing        */ \
+          0,         /* int          memoryClockRate          */ \
+          0,         /* int          memoryBusWidth           */ \
+          0,         /* int          l2CacheSize              */ \
+          0,         /* int          persistingL2CacheMaxSize   */ \
+          0,         /* int          maxThreadsPerMultiProcessor */ \
+          0,         /* int          streamPrioritiesSupported */ \
+          0,         /* int          globalL1CacheSupported   */ \
+          0,         /* int          localL1CacheSupported    */ \
+          0,         /* size_t       sharedMemPerMultiprocessor; */ \
+          0,         /* int          regsPerMultiprocessor;   */ \
+          0,         /* int          managedMemory            */ \
+          0,         /* int          isMultiGpuBoard          */ \
+          0,         /* int          multiGpuBoardGroupID     */ \
+          0,         /* int          hostNativeAtomicSupported */ \
+          0,         /* int          singleToDoublePrecisionPerfRatio */ \
+          0,         /* int          pageableMemoryAccess     */ \
+          0,         /* int          concurrentManagedAccess  */ \
+          0,         /* int          computePreemptionSupported */ \
+          0,         /* int          canUseHostPointerForRegisteredMem */ \
+          0,         /* int          cooperativeLaunch */ \
+          0,         /* int          cooperativeMultiDeviceLaunch */ \
+          0,         /* size_t       sharedMemPerBlockOptin */ \
+          0,         /* int          pageableMemoryAccessUsesHostPageTables */ \
+          0,         /* int          directManagedMemAccessFromHost */ \
+          0,         /* int          accessPolicyMaxWindowSize */ \
+          0,         /* size_t       reservedSharedMemPerBlock */ \
+        } /**< Empty device properties */
+
+/**
+ * CUDA IPC Handle Size
+ */
+#define CUDA_IPC_HANDLE_SIZE 64
+
+/**
+ * CUDA IPC event handle
+ */
+typedef __device_builtin__ struct __device_builtin__ cudaIpcEventHandle_st
+{
+    char reserved[CUDA_IPC_HANDLE_SIZE];
+}cudaIpcEventHandle_t;
+
+/**
+ * CUDA IPC memory handle
+ */
+typedef __device_builtin__ struct __device_builtin__ cudaIpcMemHandle_st 
+{
+    char reserved[CUDA_IPC_HANDLE_SIZE];
+}cudaIpcMemHandle_t;
+
+/**
+ * External memory handle types
+ */
+enum __device_builtin__ cudaExternalMemoryHandleType {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    cudaExternalMemoryHandleTypeOpaqueFd         = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    cudaExternalMemoryHandleTypeOpaqueWin32      = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    cudaExternalMemoryHandleTypeOpaqueWin32Kmt   = 3,
+    /**
+     * Handle is a D3D12 heap object
+     */
+    cudaExternalMemoryHandleTypeD3D12Heap        = 4,
+    /**
+     * Handle is a D3D12 committed resource
+     */
+    cudaExternalMemoryHandleTypeD3D12Resource    = 5,
+    /**
+    *  Handle is a shared NT handle to a D3D11 resource
+    */
+    cudaExternalMemoryHandleTypeD3D11Resource    = 6,
+    /**
+    *  Handle is a globally shared handle to a D3D11 resource
+    */
+    cudaExternalMemoryHandleTypeD3D11ResourceKmt = 7,
+    /**
+    *  Handle is an NvSciBuf object
+    */
+    cudaExternalMemoryHandleTypeNvSciBuf         = 8
+};
+
+/**
+ * Indicates that the external memory object is a dedicated resource
+ */
+#define cudaExternalMemoryDedicated   0x1
+
+/** When the /p flags parameter of ::cudaExternalSemaphoreSignalParams
+ * contains this flag, it indicates that signaling an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define cudaExternalSemaphoreSignalSkipNvSciBufMemSync     0x01
+
+/** When the /p flags parameter of ::cudaExternalSemaphoreWaitParams
+ * contains this flag, it indicates that waiting an external semaphore object
+ * should skip performing appropriate memory synchronization operations over all
+ * the external memory objects that are imported as ::cudaExternalMemoryHandleTypeNvSciBuf,
+ * which otherwise are performed by default to ensure data coherency with other
+ * importers of the same NvSciBuf memory objects.
+ */
+#define cudaExternalSemaphoreWaitSkipNvSciBufMemSync       0x02
+
+/**
+ * When /p flags of ::cudaDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that application need signaler specific NvSciSyncAttr
+ * to be filled by ::cudaDeviceGetNvSciSyncAttributes.
+ */
+#define cudaNvSciSyncAttrSignal       0x1
+
+/**
+ * When /p flags of ::cudaDeviceGetNvSciSyncAttributes is set to this,
+ * it indicates that application need waiter specific NvSciSyncAttr
+ * to be filled by ::cudaDeviceGetNvSciSyncAttributes.
+ */
+#define cudaNvSciSyncAttrWait         0x2
+
+/**
+ * External memory handle descriptor
+ */
+struct __device_builtin__ cudaExternalMemoryHandleDesc {
+    /**
+     * Type of the handle
+     */
+    enum cudaExternalMemoryHandleType type;
+    union {
+        /**
+         * File descriptor referencing the memory object. Valid
+         * when type is
+         * ::cudaExternalMemoryHandleTypeOpaqueFd
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the semaphore object. Valid when
+         * type is one of the following:
+         * - ::cudaExternalMemoryHandleTypeOpaqueWin32
+         * - ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt
+         * - ::cudaExternalMemoryHandleTypeD3D12Heap 
+         * - ::cudaExternalMemoryHandleTypeD3D12Resource
+		 * - ::cudaExternalMemoryHandleTypeD3D11Resource
+		 * - ::cudaExternalMemoryHandleTypeD3D11ResourceKmt
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following: 
+         * ::cudaExternalMemoryHandleTypeOpaqueWin32Kmt
+         * ::cudaExternalMemoryHandleTypeD3D11ResourceKmt
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid memory object.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * A handle representing NvSciBuf Object. Valid when type
+         * is ::cudaExternalMemoryHandleTypeNvSciBuf
+         */
+        const void *nvSciBufObject;
+    } handle;
+    /**
+     * Size of the memory allocation
+     */
+    unsigned long long size;
+    /**
+     * Flags must either be zero or ::cudaExternalMemoryDedicated
+     */
+    unsigned int flags;
+};
+
+/**
+ * External memory buffer descriptor
+ */
+struct __device_builtin__ cudaExternalMemoryBufferDesc {
+    /**
+     * Offset into the memory object where the buffer's base is
+     */
+    unsigned long long offset;
+    /**
+     * Size of the buffer
+     */
+    unsigned long long size;
+    /**
+     * Flags reserved for future use. Must be zero.
+     */
+    unsigned int flags;
+};
+ 
+/**
+ * External memory mipmap descriptor
+ */
+struct __device_builtin__ cudaExternalMemoryMipmappedArrayDesc {
+    /**
+     * Offset into the memory object where the base level of the
+     * mipmap chain is.
+     */
+    unsigned long long offset;
+    /**
+     * Format of base level of the mipmap chain
+     */
+    struct cudaChannelFormatDesc formatDesc;
+    /**
+     * Dimensions of base level of the mipmap chain
+     */
+    struct cudaExtent extent;
+    /**
+     * Flags associated with CUDA mipmapped arrays.
+     * See ::cudaMallocMipmappedArray
+     */
+    unsigned int flags;
+    /**
+     * Total number of levels in the mipmap chain
+     */
+    unsigned int numLevels;
+};
+ 
+/**
+ * External semaphore handle types
+ */
+enum __device_builtin__ cudaExternalSemaphoreHandleType {
+    /**
+     * Handle is an opaque file descriptor
+     */
+    cudaExternalSemaphoreHandleTypeOpaqueFd       = 1,
+    /**
+     * Handle is an opaque shared NT handle
+     */
+    cudaExternalSemaphoreHandleTypeOpaqueWin32    = 2,
+    /**
+     * Handle is an opaque, globally shared handle
+     */
+    cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt = 3,
+    /**
+     * Handle is a shared NT handle referencing a D3D12 fence object
+     */
+    cudaExternalSemaphoreHandleTypeD3D12Fence     = 4,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 fence object
+     */
+    cudaExternalSemaphoreHandleTypeD3D11Fence     = 5,
+    /**
+     * Opaque handle to NvSciSync Object
+     */
+     cudaExternalSemaphoreHandleTypeNvSciSync     = 6,
+    /**
+     * Handle is a shared NT handle referencing a D3D11 keyed mutex object
+     */
+    cudaExternalSemaphoreHandleTypeKeyedMutex     = 7,
+    /**
+     * Handle is a shared KMT handle referencing a D3D11 keyed mutex object
+     */
+    cudaExternalSemaphoreHandleTypeKeyedMutexKmt  = 8,
+    /**
+     * Handle is an opaque handle file descriptor referencing a timeline semaphore
+     */
+    cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd  = 9,
+    /**
+     * Handle is an opaque handle file descriptor referencing a timeline semaphore
+     */
+    cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32  = 10
+};
+
+/**
+ * External semaphore handle descriptor
+ */
+struct __device_builtin__ cudaExternalSemaphoreHandleDesc {
+    /**
+     * Type of the handle
+     */
+    enum cudaExternalSemaphoreHandleType type;
+    union {
+        /**
+         * File descriptor referencing the semaphore object. Valid when
+         * type is one of the following:
+         * - ::cudaExternalSemaphoreHandleTypeOpaqueFd
+         * - ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreFd
+         */
+        int fd;
+        /**
+         * Win32 handle referencing the semaphore object. Valid when
+         * type is one of the following:
+         * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32
+         * - ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+         * - ::cudaExternalSemaphoreHandleTypeD3D12Fence
+         * - ::cudaExternalSemaphoreHandleTypeD3D11Fence
+         * - ::cudaExternalSemaphoreHandleTypeKeyedMutex
+         * - ::cudaExternalSemaphoreHandleTypeTimelineSemaphoreWin32
+         * Exactly one of 'handle' and 'name' must be non-NULL. If
+         * type is one of the following:
+         * ::cudaExternalSemaphoreHandleTypeOpaqueWin32Kmt
+         * ::cudaExternalSemaphoreHandleTypeKeyedMutexKmt
+         * then 'name' must be NULL.
+         */
+        struct {
+            /**
+             * Valid NT handle. Must be NULL if 'name' is non-NULL
+             */
+            void *handle;
+            /**
+             * Name of a valid synchronization primitive.
+             * Must be NULL if 'handle' is non-NULL.
+             */
+            const void *name;
+        } win32;
+        /**
+         * Valid NvSciSyncObj. Must be non NULL
+         */
+        const void* nvSciSyncObj;
+    } handle;
+    /**
+     * Flags reserved for the future. Must be zero.
+     */
+    unsigned int flags;
+};
+
+/**
+ * External semaphore signal parameters(deprecated)
+ */
+struct __device_builtin__ cudaExternalSemaphoreSignalParams_v1 {
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of fence to be signaled
+             */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /*
+             * Value of key to release the mutex with
+             */
+            unsigned long long key;
+        } keyedMutex;
+    } params;
+    /**
+     * Only when ::cudaExternalSemaphoreSignalParams is used to
+     * signal a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is 
+     * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
+     * that while signaling the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+};
+
+/**
+* External semaphore wait parameters(deprecated)
+*/
+struct __device_builtin__ cudaExternalSemaphoreWaitParams_v1 {
+    struct {
+        /**
+        * Parameters for fence objects
+        */
+        struct {
+            /**
+            * Value of fence to be waited on
+            */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to acquire the mutex with
+             */
+            unsigned long long key;
+            /**
+             * Timeout in milliseconds to wait to acquire the mutex
+             */
+            unsigned int timeoutMs;
+        } keyedMutex;
+    } params;
+    /**
+     * Only when ::cudaExternalSemaphoreSignalParams is used to
+     * signal a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is 
+     * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
+     * that while waiting for the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+};
+
+/**
+ * External semaphore signal parameters, compatible with driver type
+ */
+struct __device_builtin__ cudaExternalSemaphoreSignalParams{
+    struct {
+        /**
+         * Parameters for fence objects
+         */
+        struct {
+            /**
+             * Value of fence to be signaled
+             */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /*
+             * Value of key to release the mutex with
+             */
+            unsigned long long key;
+        } keyedMutex;
+        unsigned int reserved[12];
+    } params;
+    /**
+     * Only when ::cudaExternalSemaphoreSignalParams is used to
+     * signal a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is 
+     * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
+     * that while signaling the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+};
+
+/**
+ * External semaphore wait parameters, compatible with driver type
+ */
+struct __device_builtin__ cudaExternalSemaphoreWaitParams {
+    struct {
+        /**
+        * Parameters for fence objects
+        */
+        struct {
+            /**
+            * Value of fence to be waited on
+            */
+            unsigned long long value;
+        } fence;
+        union {
+            /**
+             * Pointer to NvSciSyncFence. Valid if ::cudaExternalSemaphoreHandleType
+             * is of type ::cudaExternalSemaphoreHandleTypeNvSciSync.
+             */
+            void *fence;
+            unsigned long long reserved;
+        } nvSciSync;
+        /**
+         * Parameters for keyed mutex objects
+         */
+        struct {
+            /**
+             * Value of key to acquire the mutex with
+             */
+            unsigned long long key;
+            /**
+             * Timeout in milliseconds to wait to acquire the mutex
+             */
+            unsigned int timeoutMs;
+        } keyedMutex;
+        unsigned int reserved[10];
+    } params;
+    /**
+     * Only when ::cudaExternalSemaphoreSignalParams is used to
+     * signal a ::cudaExternalSemaphore_t of type
+     * ::cudaExternalSemaphoreHandleTypeNvSciSync, the valid flag is 
+     * ::cudaExternalSemaphoreSignalSkipNvSciBufMemSync: which indicates
+     * that while waiting for the ::cudaExternalSemaphore_t, no memory
+     * synchronization operations should be performed for any external memory
+     * object imported as ::cudaExternalMemoryHandleTypeNvSciBuf.
+     * For all other types of ::cudaExternalSemaphore_t, flags must be zero.
+     */
+    unsigned int flags;
+    unsigned int reserved[16];
+};
+
+/*******************************************************************************
+*                                                                              *
+*  SHORTHAND TYPE DEFINITION USED BY RUNTIME API                               *
+*                                                                              *
+*******************************************************************************/
+
+/**
+ * CUDA Error types
+ */
+typedef __device_builtin__ enum cudaError cudaError_t;
+
+/**
+ * CUDA stream
+ */
+typedef __device_builtin__ struct CUstream_st *cudaStream_t;
+
+/**
+ * CUDA event types
+ */
+typedef __device_builtin__ struct CUevent_st *cudaEvent_t;
+
+/**
+ * CUDA graphics resource types
+ */
+typedef __device_builtin__ struct cudaGraphicsResource *cudaGraphicsResource_t;
+
+/**
+ * CUDA output file modes
+ */
+typedef __device_builtin__ enum cudaOutputMode cudaOutputMode_t;
+
+/**
+ * CUDA external memory
+ */
+typedef __device_builtin__ struct CUexternalMemory_st *cudaExternalMemory_t;
+
+/**
+ * CUDA external semaphore
+ */
+typedef __device_builtin__ struct CUexternalSemaphore_st *cudaExternalSemaphore_t;
+
+/**
+ * CUDA graph
+ */
+typedef __device_builtin__ struct CUgraph_st *cudaGraph_t;
+
+/**
+ * CUDA graph node.
+ */
+typedef __device_builtin__ struct CUgraphNode_st *cudaGraphNode_t;
+
+/**
+ * CUDA user object for graphs
+ */
+typedef __device_builtin__ struct CUuserObject_st *cudaUserObject_t;
+
+/**
+ * CUDA function
+ */
+typedef __device_builtin__ struct CUfunc_st *cudaFunction_t;
+
+/**
+ * CUDA memory pool
+ */
+typedef __device_builtin__ struct CUmemPoolHandle_st *cudaMemPool_t;
+
+/**
+ * CUDA cooperative group scope
+ */
+enum __device_builtin__ cudaCGScope {
+    cudaCGScopeInvalid   = 0, /**< Invalid cooperative group scope */
+    cudaCGScopeGrid      = 1, /**< Scope represented by a grid_group */
+    cudaCGScopeMultiGrid = 2  /**< Scope represented by a multi_grid_group */
+};
+
+/**
+ * CUDA launch parameters
+ */
+struct __device_builtin__ cudaLaunchParams
+{
+    void *func;          /**< Device function symbol */
+    dim3 gridDim;        /**< Grid dimentions */
+    dim3 blockDim;       /**< Block dimentions */
+    void **args;         /**< Arguments */
+    size_t sharedMem;    /**< Shared memory */
+    cudaStream_t stream; /**< Stream identifier */
+};
+
+/**
+ * CUDA GPU kernel node parameters
+ */
+struct __device_builtin__ cudaKernelNodeParams {
+    void* func;                     /**< Kernel to launch */
+    dim3 gridDim;                   /**< Grid dimensions */
+    dim3 blockDim;                  /**< Block dimensions */
+    unsigned int sharedMemBytes;    /**< Dynamic shared-memory size per thread block in bytes */
+    void **kernelParams;            /**< Array of pointers to individual kernel arguments*/
+    void **extra;                   /**< Pointer to kernel arguments in the "extra" format */
+};
+
+/**
+ * External semaphore signal node parameters
+ */
+struct __device_builtin__ cudaExternalSemaphoreSignalNodeParams {
+    cudaExternalSemaphore_t* extSemArray;                        /**< Array of external semaphore handles. */
+    const struct cudaExternalSemaphoreSignalParams* paramsArray; /**< Array of external semaphore signal parameters. */
+    unsigned int numExtSems;                                     /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+};
+
+/**
+ * External semaphore wait node parameters
+ */
+struct __device_builtin__ cudaExternalSemaphoreWaitNodeParams {
+    cudaExternalSemaphore_t* extSemArray;                      /**< Array of external semaphore handles. */
+    const struct cudaExternalSemaphoreWaitParams* paramsArray; /**< Array of external semaphore wait parameters. */
+    unsigned int numExtSems;                                   /**< Number of handles and parameters supplied in extSemArray and paramsArray. */
+};
+
+/**
+* CUDA Graph node types
+*/
+enum __device_builtin__ cudaGraphNodeType {
+    cudaGraphNodeTypeKernel      = 0x00, /**< GPU kernel node */
+    cudaGraphNodeTypeMemcpy      = 0x01, /**< Memcpy node */
+    cudaGraphNodeTypeMemset      = 0x02, /**< Memset node */
+    cudaGraphNodeTypeHost        = 0x03, /**< Host (executable) node */
+    cudaGraphNodeTypeGraph       = 0x04, /**< Node which executes an embedded graph */
+    cudaGraphNodeTypeEmpty       = 0x05, /**< Empty (no-op) node */
+    cudaGraphNodeTypeWaitEvent   = 0x06, /**< External event wait node */
+    cudaGraphNodeTypeEventRecord = 0x07, /**< External event record node */
+    cudaGraphNodeTypeExtSemaphoreSignal = 0x08, /**< External semaphore signal node */
+    cudaGraphNodeTypeExtSemaphoreWait = 0x09, /**< External semaphore wait node */
+    cudaGraphNodeTypeMemAlloc    = 0x0a, /**< Memory allocation node */
+    cudaGraphNodeTypeMemFree     = 0x0b, /**< Memory free node */
+    cudaGraphNodeTypeCount
+};
+
+/**
+ * CUDA executable (launchable) graph
+ */
+typedef struct CUgraphExec_st* cudaGraphExec_t;
+
+/**
+* CUDA Graph Update error types
+*/
+enum __device_builtin__ cudaGraphExecUpdateResult {
+    cudaGraphExecUpdateSuccess                = 0x0, /**< The update succeeded */
+    cudaGraphExecUpdateError                  = 0x1, /**< The update failed for an unexpected reason which is described in the return value of the function */
+    cudaGraphExecUpdateErrorTopologyChanged   = 0x2, /**< The update failed because the topology changed */
+    cudaGraphExecUpdateErrorNodeTypeChanged   = 0x3, /**< The update failed because a node type changed */
+    cudaGraphExecUpdateErrorFunctionChanged   = 0x4, /**< The update failed because the function of a kernel node changed (CUDA driver < 11.2) */
+    cudaGraphExecUpdateErrorParametersChanged = 0x5, /**< The update failed because the parameters changed in a way that is not supported */
+    cudaGraphExecUpdateErrorNotSupported      = 0x6, /**< The update failed because something about the node is not supported */
+    cudaGraphExecUpdateErrorUnsupportedFunctionChange = 0x7, /**< The update failed because the function of a kernel node changed in an unsupported way */
+    cudaGraphExecUpdateErrorAttributesChanged = 0x8 /**< The update failed because the node attributes changed in a way that is not supported */
+};
+
+/**
+ * Flags to specify search options to be used with ::cudaGetDriverEntryPoint
+ * For more details see ::cuGetProcAddress
+ */ 
+enum __device_builtin__ cudaGetDriverEntryPointFlags {
+    cudaEnableDefault                = 0x0, /**< Default search mode for driver symbols. */
+    cudaEnableLegacyStream           = 0x1, /**< Search for legacy versions of driver symbols. */
+    cudaEnablePerThreadDefaultStream = 0x2  /**< Search for per-thread versions of driver symbols. */
+};
+
+/**
+ * CUDA Graph debug write options
+ */
+enum __device_builtin__ cudaGraphDebugDotFlags {
+    cudaGraphDebugDotFlagsVerbose                  = 1<<0,  /**< Output all debug data as if every debug flag is enabled */
+    cudaGraphDebugDotFlagsKernelNodeParams         = 1<<2,  /**< Adds cudaKernelNodeParams to output */
+    cudaGraphDebugDotFlagsMemcpyNodeParams         = 1<<3,  /**< Adds cudaMemcpy3DParms to output */
+    cudaGraphDebugDotFlagsMemsetNodeParams         = 1<<4,  /**< Adds cudaMemsetParams to output */
+    cudaGraphDebugDotFlagsHostNodeParams           = 1<<5,  /**< Adds cudaHostNodeParams to output */
+    cudaGraphDebugDotFlagsEventNodeParams          = 1<<6,  /**< Adds cudaEvent_t handle from record and wait nodes to output */
+    cudaGraphDebugDotFlagsExtSemasSignalNodeParams = 1<<7,  /**< Adds cudaExternalSemaphoreSignalNodeParams values to output */
+    cudaGraphDebugDotFlagsExtSemasWaitNodeParams   = 1<<8,  /**< Adds cudaExternalSemaphoreWaitNodeParams to output */
+    cudaGraphDebugDotFlagsKernelNodeAttributes     = 1<<9,  /**< Adds cudaKernelNodeAttrID values to output */
+    cudaGraphDebugDotFlagsHandles                  = 1<<10  /**< Adds node handles and every kernel function handle to output */
+};
+
+/**
+ * Flags for instantiating a graph
+ */
+enum __device_builtin__ cudaGraphInstantiateFlags {
+    cudaGraphInstantiateFlagAutoFreeOnLaunch = 1 /**< Automatically free memory allocated in a graph before relaunching. */
+  , cudaGraphInstantiateFlagUseNodePriority  = 8 /**< Run the graph using the per-node priority attributes rather than the
+                                                      priority of the stream it is launched into. */
+};
+
+/**
+ * Launch attributes enum; used as id field of ::cudaLaunchAttribute
+ */
+typedef __device_builtin__ enum cudaLaunchAttributeID {
+    cudaLaunchAttributeIgnore                = 0 /**< Ignored entry, for convenient composition */
+  , cudaLaunchAttributeAccessPolicyWindow    = 1 /**< Valid for streams, graph nodes, launches. */
+  , cudaLaunchAttributeCooperative           = 2 /**< Valid for graph nodes, launches. */
+  , cudaLaunchAttributeSynchronizationPolicy = 3 /**< Valid for streams. */
+  , cudaLaunchAttributeClusterDimension                  = 4 /**< Valid for graph nodes, launches. */
+  , cudaLaunchAttributeClusterSchedulingPolicyPreference = 5 /**< Valid for graph nodes, launches. */
+  , cudaLaunchAttributeProgrammaticStreamSerialization   = 6 /**< Valid for launches. Setting
+                                                                  programmaticStreamSerializationAllowed to non-0
+                                                                  signals that the kernel will use programmatic
+                                                                  means to resolve its stream dependency, so that
+                                                                  the CUDA runtime should opportunistically allow
+                                                                  the grid's execution to overlap with the previous
+                                                                  kernel in the stream, if that kernel requests the
+                                                                  overlap. */
+  , cudaLaunchAttributeProgrammaticEvent                 = 7 /**< Valid for launches. Event recorded through this launch
+                                                                  attribute is guaranteed to only trigger after all
+                                                                  block in the associated kernel trigger the event. A
+                                                                  block can trigger the event through PTX
+                                                                  griddepcontrol.launch_dependents. A trigger can also
+                                                                  be inserted at the beginning of each block's execution
+                                                                  if triggerAtBlockStart is set to non-0. Note that
+                                                                  dependents (including the CPU thread calling
+                                                                  cudaEventSynchronize()) are not guaranteed to observe
+                                                                  the release precisely when it is released. For
+                                                                  example, cudaEventSynchronize() may only observe the
+                                                                  event trigger long after the associated kernel has
+                                                                  completed. This recording type is primarily meant for
+                                                                  establishing programmatic dependency between device
+                                                                  tasks. The event supplied must not be an interprocess
+                                                                  or interop event. The event must disable timing
+                                                                  (i.e. created with ::cudaEventDisableTiming flag
+                                                                  set). */
+  , cudaLaunchAttributePriority              = 8 /**< Valid for graph nodes. */
+} cudaLaunchAttributeID;
+
+/**
+ * Launch attributes union; used as value field of ::cudaLaunchAttribute
+ */
+typedef __device_builtin__ union cudaLaunchAttributeValue {
+    char pad[64]; /* Pad to 64 bytes */
+    struct cudaAccessPolicyWindow accessPolicyWindow;
+    int cooperative;
+    enum cudaSynchronizationPolicy syncPolicy;
+    struct {
+        unsigned int x;
+        unsigned int y;
+        unsigned int z;
+    } clusterDim;
+    enum cudaClusterSchedulingPolicy clusterSchedulingPolicyPreference;
+    int programmaticStreamSerializationAllowed;
+    struct {
+        cudaEvent_t event;
+        int flags;
+        int triggerAtBlockStart;
+    } programmaticEvent;
+    int priority;
+} cudaLaunchAttributeValue;
+
+/**
+ * Launch attribute
+ */
+typedef __device_builtin__ struct cudaLaunchAttribute_st {
+    cudaLaunchAttributeID id;
+    char pad[8 - sizeof(cudaLaunchAttributeID)];
+    cudaLaunchAttributeValue val;
+} cudaLaunchAttribute;
+
+/**
+ * CUDA extensible launch configuration
+ */
+typedef __device_builtin__ struct cudaLaunchConfig_st {
+    dim3 gridDim;               /**< Grid dimentions */
+    dim3 blockDim;              /**< Block dimentions */
+    size_t dynamicSmemBytes;    /**< Dynamic shared-memory size per thread block in bytes */
+    cudaStream_t stream;        /**< Stream identifier */
+    cudaLaunchAttribute *attrs; /**< nullable if numAttrs == 0 */
+    unsigned int numAttrs;      /**< Number of attributes populated in attrs */
+} cudaLaunchConfig_t;
+
+/**
+ * Stream Attributes
+ */
+#define cudaStreamAttrID cudaLaunchAttributeID
+#define cudaStreamAttributeAccessPolicyWindow    cudaLaunchAttributeAccessPolicyWindow
+#define cudaStreamAttributeSynchronizationPolicy cudaLaunchAttributeSynchronizationPolicy
+
+/**
+ * Stream attributes union used with ::cudaStreamSetAttribute/::cudaStreamGetAttribute
+ */
+#define cudaStreamAttrValue cudaLaunchAttributeValue
+
+/**
+ * Graph kernel node Attributes
+ */
+#define cudaKernelNodeAttrID cudaLaunchAttributeID
+#define cudaKernelNodeAttributeAccessPolicyWindow cudaLaunchAttributeAccessPolicyWindow
+#define cudaKernelNodeAttributeCooperative        cudaLaunchAttributeCooperative
+#define cudaKernelNodeAttributePriority           cudaLaunchAttributePriority
+#define cudaKernelNodeAttributeClusterDimension                     cudaLaunchAttributeClusterDimension
+#define cudaKernelNodeAttributeClusterSchedulingPolicyPreference    cudaLaunchAttributeClusterSchedulingPolicyPreference
+
+/**
+ * Graph kernel node attributes union, used with ::cudaGraphKernelNodeSetAttribute/::cudaGraphKernelNodeGetAttribute
+ */
+#define cudaKernelNodeAttrValue cudaLaunchAttributeValue
+
+/** @} */
+/** @} */ /* END CUDART_TYPES */
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_DRIVER_TYPES_H__
+#endif
+
+#undef __CUDA_DEPRECATED
+
+#endif /* !__DRIVER_TYPES_H__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h
new file mode 100644
index 0000000000000000000000000000000000000000..785bec4e5c0652f9605ccf9341b7f761a85471ab
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/host_config.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("host_config.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "host_config.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
+#endif
+
+#include "crt/host_config.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_HOST_CONFIG_H_WRAPPER__
+#endif
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..4a7e42c6b89ba4b446d4cf3d52c8bacd74e73b0d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/library_types.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright 1993-2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__LIBRARY_TYPES_H__)
+#define __LIBRARY_TYPES_H__
+
+
+
+typedef enum cudaDataType_t
+{
+    CUDA_R_16F  =  2, /* real as a half */
+    CUDA_C_16F  =  6, /* complex as a pair of half numbers */
+    CUDA_R_16BF = 14, /* real as a nv_bfloat16 */
+    CUDA_C_16BF = 15, /* complex as a pair of nv_bfloat16 numbers */
+    CUDA_R_32F  =  0, /* real as a float */
+    CUDA_C_32F  =  4, /* complex as a pair of float numbers */
+    CUDA_R_64F  =  1, /* real as a double */
+    CUDA_C_64F  =  5, /* complex as a pair of double numbers */
+    CUDA_R_4I   = 16, /* real as a signed 4-bit int */
+    CUDA_C_4I   = 17, /* complex as a pair of signed 4-bit int numbers */
+    CUDA_R_4U   = 18, /* real as a unsigned 4-bit int */
+    CUDA_C_4U   = 19, /* complex as a pair of unsigned 4-bit int numbers */
+    CUDA_R_8I   =  3, /* real as a signed 8-bit int */
+    CUDA_C_8I   =  7, /* complex as a pair of signed 8-bit int numbers */
+    CUDA_R_8U   =  8, /* real as a unsigned 8-bit int */
+    CUDA_C_8U   =  9, /* complex as a pair of unsigned 8-bit int numbers */
+    CUDA_R_16I  = 20, /* real as a signed 16-bit int */
+    CUDA_C_16I  = 21, /* complex as a pair of signed 16-bit int numbers */
+    CUDA_R_16U  = 22, /* real as a unsigned 16-bit int */
+    CUDA_C_16U  = 23, /* complex as a pair of unsigned 16-bit int numbers */
+    CUDA_R_32I  = 10, /* real as a signed 32-bit int */
+    CUDA_C_32I  = 11, /* complex as a pair of signed 32-bit int numbers */
+    CUDA_R_32U  = 12, /* real as a unsigned 32-bit int */
+    CUDA_C_32U  = 13, /* complex as a pair of unsigned 32-bit int numbers */
+    CUDA_R_64I  = 24, /* real as a signed 64-bit int */
+    CUDA_C_64I  = 25, /* complex as a pair of signed 64-bit int numbers */
+    CUDA_R_64U  = 26, /* real as a unsigned 64-bit int */
+    CUDA_C_64U  = 27, /* complex as a pair of unsigned 64-bit int numbers */
+    CUDA_R_8F_E4M3 = 28, /* real as a nv_fp8_e4m3 */
+    CUDA_R_8F_E5M2 = 29, /* real as a nv_fp8_e5m2 */
+} cudaDataType;
+
+
+typedef enum libraryPropertyType_t
+{
+    MAJOR_VERSION,
+    MINOR_VERSION,
+    PATCH_LEVEL
+} libraryPropertyType;
+
+
+#ifndef __cplusplus
+typedef enum cudaDataType_t cudaDataType_t;
+typedef enum libraryPropertyType_t libraryPropertyType_t;
+#endif
+
+#endif /* !__LIBRARY_TYPES_H__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc806976784e494edc905d8b8bd9ad138054bbea
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/math_functions.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright 1993-2018 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__)
+#if defined(_MSC_VER)
+#pragma message("math_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead.")
+#else
+#warning "math_functions.h is an internal header file and must not be used directly.  This file will be removed in a future CUDA release.  Please use cuda_runtime_api.h or cuda_runtime.h instead."
+#endif
+#define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#define __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
+#endif
+
+#include "crt/math_functions.h"
+
+#if defined(__UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__)
+#undef __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
+#undef __UNDEF_CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS_MATH_FUNCTIONS_H_WRAPPER__
+#endif
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ebe60b8ca83666f07464b06ff04e6fc432c31b7b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_32_atomic_functions.hpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 35.235 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.35.235 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_32_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_32_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* !__CUDACC_RTC__ */
+#define __SM_32_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMin(long long *address, long long val)
+{
+    return __illAtomicMin(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicMax(long long *address, long long val)
+{
+    return __illAtomicMax(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicAnd(long long *address, long long val)
+{
+    return __llAtomicAnd(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicOr(long long *address, long long val)
+{
+    return __llAtomicOr(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ long long atomicXor(long long *address, long long val)
+{
+    return __llAtomicXor(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMin(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicMin(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicMax(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicMax(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicAnd(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicAnd(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicOr(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicOr(address, val);
+}
+
+__SM_32_ATOMIC_FUNCTIONS_DECL__ unsigned long long atomicXor(unsigned long long *address, unsigned long long val)
+{
+    return __ullAtomicXor(address, val);
+}
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 320 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_32_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_32_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b4d5227023221116868e8446fdac23efb96e94ae
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/sm_60_atomic_functions.hpp
@@ -0,0 +1,527 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SM_60_ATOMIC_FUNCTIONS_HPP__)
+#define __SM_60_ATOMIC_FUNCTIONS_HPP__
+
+#if defined(__CUDACC_RTC__)
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ __device__
+#else /* __CUDACC_RTC__ */
+#define __SM_60_ATOMIC_FUNCTIONS_DECL__ static __inline__ __device__
+#endif /* __CUDACC_RTC__ */
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 600
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__ double atomicAdd(double *address, double val)
+{
+  return __dAtomicAdd(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_block(int *address, int val)
+{
+  return __iAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAdd_system(int *address, int val)
+{
+  return __iAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAdd_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAdd_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_block(float *address, float val)
+{
+  return __fAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicAdd_system(float *address, float val)
+{
+  return __fAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_block(double *address, double val)
+{
+  return __dAtomicAdd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+double atomicAdd_system(double *address, double val)
+{
+  return __dAtomicAdd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_block(int *address, int val)
+{
+  return __iAtomicAdd_block(address, (unsigned int)-(int)val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicSub_system(int *address, int val)
+{
+  return __iAtomicAdd_system(address, (unsigned int)-(int)val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_block(address, (unsigned int)-(int)val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicSub_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAdd_system(address, (unsigned int)-(int)val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_block(int *address, int val)
+{
+  return __iAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicExch_system(int *address, int val)
+{
+  return __iAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicExch_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicExch_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_block(float *address, float val)
+{
+  return __fAtomicExch_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+float atomicExch_system(float *address, float val)
+{
+  return __fAtomicExch_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_block(int *address, int val)
+{
+  return __iAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMin_system(int *address, int val)
+{
+  return __iAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_block(long long *address, long long val)
+{
+  return __illAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMin_system(long long *address, long long val)
+{
+  return __illAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMin_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMin_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMin_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMin_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_block(int *address, int val)
+{
+  return __iAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicMax_system(int *address, int val)
+{
+  return __iAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_block(long long *address, long long val)
+{
+  return __illAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicMax_system(long long *address, long long val)
+{
+  return __illAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicMax_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMax_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicMax_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicMax_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicInc_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicInc_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicDec_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicDec_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_block(int *address, int compare, int val)
+{
+  return __iAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicCAS_system(int *address, int compare, int val)
+{
+  return __iAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_block(unsigned int *address, unsigned int compare,
+                             unsigned int val)
+{
+  return __uAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicCAS_system(unsigned int *address, unsigned int compare,
+                              unsigned int val)
+{
+  return __uAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_block(unsigned long long int *address,
+                                       unsigned long long int compare,
+                                       unsigned long long int val)
+{
+  return __ullAtomicCAS_block(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long int atomicCAS_system(unsigned long long int *address,
+                                        unsigned long long int compare,
+                                        unsigned long long int val)
+{
+  return __ullAtomicCAS_system(address, compare, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_block(int *address, int val)
+{
+  return __iAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicAnd_system(int *address, int val)
+{
+  return __iAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_block(long long *address, long long val)
+{
+  return __llAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicAnd_system(long long *address, long long val)
+{
+  return __llAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicAnd_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAnd_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicAnd_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicAnd_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_block(int *address, int val)
+{
+  return __iAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicOr_system(int *address, int val)
+{
+  return __iAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_block(long long *address, long long val)
+{
+  return __llAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicOr_system(long long *address, long long val)
+{
+  return __llAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicOr_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicOr_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicOr_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicOr_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_block(int *address, int val)
+{
+  return __iAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+int atomicXor_system(int *address, int val)
+{
+  return __iAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_block(long long *address, long long val)
+{
+  return __llAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+long long atomicXor_system(long long *address, long long val)
+{
+  return __llAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_block(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned int atomicXor_system(unsigned int *address, unsigned int val)
+{
+  return __uAtomicXor_system(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_block(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicXor_block(address, val);
+}
+
+__SM_60_ATOMIC_FUNCTIONS_DECL__
+unsigned long long atomicXor_system(unsigned long long *address, unsigned long long val)
+{
+  return __ullAtomicXor_system(address, val);
+}
+
+#endif /* !__CUDA_ARCH__ || __CUDA_ARCH__ >= 600 */
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#undef __SM_60_ATOMIC_FUNCTIONS_DECL__
+
+#endif /* !__SM_60_ATOMIC_FUNCTIONS_HPP__ */
+
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..587a995d0ea8a697b028706e824f9437276401e6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/surface_functions.h
@@ -0,0 +1,439 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__SURFACE_FUNCTIONS_H__)
+#define __SURFACE_FUNCTIONS_H__
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+#include "cuda_surface_types.h"
+
+#if defined(_WIN32)
+# define __DEPRECATED__ __declspec(deprecated)
+#else
+# define __DEPRECATED__  __attribute__((deprecated))
+#endif
+
+
+
+#ifdef __CUDA_ARCH__
+template <typename T> struct __nv_surf_trait {  typedef void * cast_type; };
+
+template<> struct __nv_surf_trait<char> {  typedef char * cast_type; };
+template<> struct __nv_surf_trait<signed char> {  typedef signed char * cast_type; };
+template<> struct __nv_surf_trait<unsigned char> {  typedef unsigned char * cast_type; };
+template<> struct __nv_surf_trait<char1> {  typedef char1 * cast_type; };
+template<> struct __nv_surf_trait<uchar1> {  typedef uchar1 * cast_type; };
+template<> struct __nv_surf_trait<char2> {  typedef char2 * cast_type; };
+template<> struct __nv_surf_trait<uchar2> {  typedef uchar2 * cast_type; };
+template<> struct __nv_surf_trait<char4> {  typedef char4 * cast_type; };
+template<> struct __nv_surf_trait<uchar4> {  typedef uchar4 * cast_type; };
+template<> struct __nv_surf_trait<short> {  typedef short * cast_type; };
+template<> struct __nv_surf_trait<unsigned short> {  typedef unsigned short * cast_type; };
+template<> struct __nv_surf_trait<short1> {  typedef short1 * cast_type; };
+template<> struct __nv_surf_trait<ushort1> {  typedef ushort1 * cast_type; };
+template<> struct __nv_surf_trait<short2> {  typedef short2 * cast_type; };
+template<> struct __nv_surf_trait<ushort2> {  typedef ushort2 * cast_type; };
+template<> struct __nv_surf_trait<short4> {  typedef short4 * cast_type; };
+template<> struct __nv_surf_trait<ushort4> {  typedef ushort4 * cast_type; };
+template<> struct __nv_surf_trait<int> {  typedef int * cast_type; };
+template<> struct __nv_surf_trait<unsigned int> {  typedef unsigned int * cast_type; };
+template<> struct __nv_surf_trait<int1> {  typedef int1 * cast_type; };
+template<> struct __nv_surf_trait<uint1> {  typedef uint1 * cast_type; };
+template<> struct __nv_surf_trait<int2> {  typedef int2 * cast_type; };
+template<> struct __nv_surf_trait<uint2> {  typedef uint2 * cast_type; };
+template<> struct __nv_surf_trait<int4> {  typedef int4 * cast_type; };
+template<> struct __nv_surf_trait<uint4> {  typedef uint4 * cast_type; };
+template<> struct __nv_surf_trait<long long> {  typedef long long * cast_type; };
+template<> struct __nv_surf_trait<unsigned long long> {  typedef unsigned long long * cast_type; };
+template<> struct __nv_surf_trait<longlong1> {  typedef longlong1 * cast_type; };
+template<> struct __nv_surf_trait<ulonglong1> {  typedef ulonglong1 * cast_type; };
+template<> struct __nv_surf_trait<longlong2> {  typedef longlong2 * cast_type; };
+template<> struct __nv_surf_trait<ulonglong2> {  typedef ulonglong2 * cast_type; };
+#if !defined(__LP64__)
+template<> struct __nv_surf_trait<long> {  typedef int * cast_type; };
+template<> struct __nv_surf_trait<unsigned long> {  typedef unsigned int * cast_type; };
+template<> struct __nv_surf_trait<long1> {  typedef int1 * cast_type; };
+template<> struct __nv_surf_trait<ulong1> {  typedef uint1 * cast_type; };
+template<> struct __nv_surf_trait<long2> {  typedef int2 * cast_type; };
+template<> struct __nv_surf_trait<ulong2> {  typedef uint2 * cast_type; };
+template<> struct __nv_surf_trait<long4> {  typedef uint4 * cast_type; };
+template<> struct __nv_surf_trait<ulong4> {  typedef int4 * cast_type; };
+#endif
+template<> struct __nv_surf_trait<float> {  typedef float * cast_type; };
+template<> struct __nv_surf_trait<float1> {  typedef float1 * cast_type; };
+template<> struct __nv_surf_trait<float2> {  typedef float2 * cast_type; };
+template<> struct __nv_surf_trait<float4> {  typedef float4 * cast_type; };
+#endif /* defined(__CUDA_ARCH__) */
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf1Dread_v2", (void *)res, s, surf, x, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surf1Dread(surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surf1Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, mode);
+  return temp;
+#endif
+}
+  
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1Dread(T *res, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  *res = surf1Dread<T>(surf, x, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf2Dread_v2", (void *)res, s, surf, x, y, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surf2Dread(surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surf2Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, mode);
+  return temp;
+#endif
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2Dread(T *res, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  *res = surf2Dread<T>(surf, x, y, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf3Dread_v2", (void *)res, s, surf, x, y, z, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surf3Dread(surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surf3Dread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, z, mode);
+  return temp;
+#endif
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf3Dread(T *res, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  *res = surf3Dread<T>(surf, x, y, z, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int  layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf1DLayeredread_v2", (void *)res, s, surf, x,  layer, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surf1DLayeredread(surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surf1DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, layer, mode);
+  return temp;
+#endif
+}
+
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredread(T *res, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  *res = surf1DLayeredread<T>(surf, x, layer, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x,  int y, int  layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf2DLayeredread_v2", (void *)res, s, surf, x, y, layer, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surf2DLayeredread(surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surf2DLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layer, mode);
+  return temp;
+#endif
+}
+
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredread(T *res, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  *res = surf2DLayeredread<T>(surf, x, y, layer, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+template <typename T>
+static __device__  __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x,  int y, int  face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surfCubemapread_v2", (void *)res, s, surf, x, y, face, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surfCubemapread(surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+
+  __nv_tex_surf_handler("__surfCubemapread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, face, mode);
+  return temp;
+#endif
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapread(T *res, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__  
+  *res = surfCubemapread<T>(surf, x, y, face, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+template <typename T>
+static __DEPRECATED__ __device__  __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x,  int y, int  layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (void *)res, s, surf, x, y, layerFace, mode);
+#endif     
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__  T surfCubemapLayeredread(surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  T temp;
+  __nv_tex_surf_handler("__surfCubemapLayeredread_v2", (typename __nv_surf_trait<T>::cast_type)&temp, (int)sizeof(T), surf, x, y, layerFace, mode);
+  return temp;
+#endif
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredread(T *res, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__  
+  *res = surfCubemapLayeredread<T>(surf, x, y, layerFace, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+//surf1Dwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf1Dwrite_v2", (void *)&val, s, surf, x, mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1Dwrite(T val, surface<void, cudaSurfaceType1D> surf, int x, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surf1Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x,  mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+//surf2Dwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf2Dwrite_v2", (void *)&val,  s, surf, x, y, mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2Dwrite(T val, surface<void, cudaSurfaceType2D> surf, int x, int y, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surf2Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y,  mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+//surf3Dwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf3Dwrite_v2", (void *)&val,  s, surf, x, y, z,mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf3Dwrite(T val, surface<void, cudaSurfaceType3D> surf, int x, int y, int z, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surf3Dwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, z,  mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+//surf1DLayeredwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (void *)&val,  s, surf, x, layer,mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf1DLayeredwrite(T val, surface<void, cudaSurfaceType1DLayered> surf, int x, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surf1DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val,  (int)sizeof(T), surf, x, layer, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+//surf2DLayeredwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (void *)&val, s, surf, x, y, layer,mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surf2DLayeredwrite(T val, surface<void, cudaSurfaceType2DLayered> surf, int x, int y, int layer, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surf2DLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val,  (int)sizeof(T), surf, x, y, layer, mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+//surfCubemapwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surfCubemapwrite_v2", (void *)&val, s, surf, x, y, face, mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapwrite(T val, surface<void, cudaSurfaceTypeCubemap> surf, int x, int y, int face, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surfCubemapwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, face,  mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+
+//surfCubemapLayeredwrite
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, int s, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__
+  __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (void *)&val, s, surf, x, y, layerFace,  mode);
+#endif  
+}
+
+template<class T>
+static __DEPRECATED__ __device__ __forceinline__ void surfCubemapLayeredwrite(T val, surface<void, cudaSurfaceTypeCubemapLayered> surf, int x, int y, int layerFace, enum cudaSurfaceBoundaryMode mode = cudaBoundaryModeTrap)
+{
+#ifdef __CUDA_ARCH__ 
+  __nv_tex_surf_handler("__surfCubemapLayeredwrite_v2", (typename __nv_surf_trait<T>::cast_type)&val, (int)sizeof(T), surf, x, y, layerFace,  mode);
+#endif /* __CUDA_ARCH__ */  
+}
+
+#undef __DEPRECATED__
+
+
+#endif /* __cplusplus && __CUDACC__ */
+#endif /* !__SURFACE_FUNCTIONS_H__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad970aea7a04023822ba01515a10c6e83c4d7def
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/texture_fetch_functions.h
@@ -0,0 +1,739 @@
+/*
+ * Copyright 1993-2017 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(__TEXTURE_FETCH_FUNCTIONS_H__)
+#define __TEXTURE_FETCH_FUNCTIONS_H__
+
+
+#if defined(__cplusplus) && defined(__CUDACC__)
+
+/*******************************************************************************
+*                                                                              *
+*                                                                              *
+*                                                                              *
+*******************************************************************************/
+
+#include "cuda_runtime_api.h"
+#include "cuda_texture_types.h"
+
+#if defined(_WIN32)
+# define __DEPRECATED__ __declspec(deprecated)
+#else
+# define __DEPRECATED__  __attribute__((deprecated))
+#endif
+
+
+template <typename T>
+struct __nv_tex_rmet_ret { };
+
+template<> struct __nv_tex_rmet_ret<char> { typedef char type; };
+template<> struct __nv_tex_rmet_ret<signed char> { typedef signed char type; };
+template<> struct __nv_tex_rmet_ret<unsigned char> { typedef unsigned char type; };
+template<> struct __nv_tex_rmet_ret<char1> { typedef char1 type; };
+template<> struct __nv_tex_rmet_ret<uchar1> { typedef uchar1 type; };
+template<> struct __nv_tex_rmet_ret<char2> { typedef char2 type; };
+template<> struct __nv_tex_rmet_ret<uchar2> { typedef uchar2 type; };
+template<> struct __nv_tex_rmet_ret<char4> { typedef char4 type; };
+template<> struct __nv_tex_rmet_ret<uchar4> { typedef uchar4 type; };
+
+template<> struct __nv_tex_rmet_ret<short> { typedef short type; };
+template<> struct __nv_tex_rmet_ret<unsigned short> { typedef unsigned short type; };
+template<> struct __nv_tex_rmet_ret<short1> { typedef short1 type; };
+template<> struct __nv_tex_rmet_ret<ushort1> { typedef ushort1 type; };
+template<> struct __nv_tex_rmet_ret<short2> { typedef short2 type; };
+template<> struct __nv_tex_rmet_ret<ushort2> { typedef ushort2 type; };
+template<> struct __nv_tex_rmet_ret<short4> { typedef short4 type; };
+template<> struct __nv_tex_rmet_ret<ushort4> { typedef ushort4 type; };
+
+template<> struct __nv_tex_rmet_ret<int> { typedef int type; };
+template<> struct __nv_tex_rmet_ret<unsigned int> { typedef unsigned int type; };
+template<> struct __nv_tex_rmet_ret<int1> { typedef int1 type; };
+template<> struct __nv_tex_rmet_ret<uint1> { typedef uint1 type; };
+template<> struct __nv_tex_rmet_ret<int2> { typedef int2 type; };
+template<> struct __nv_tex_rmet_ret<uint2> { typedef uint2 type; };
+template<> struct __nv_tex_rmet_ret<int4> { typedef int4 type; };
+template<> struct __nv_tex_rmet_ret<uint4> { typedef uint4 type; };
+
+#if !defined(__LP64__)
+template<> struct __nv_tex_rmet_ret<long> { typedef long type; };
+template<> struct __nv_tex_rmet_ret<unsigned long> { typedef unsigned long type; };
+template<> struct __nv_tex_rmet_ret<long1> { typedef long1 type; };
+template<> struct __nv_tex_rmet_ret<ulong1> { typedef ulong1 type; };
+template<> struct __nv_tex_rmet_ret<long2> { typedef long2 type; };
+template<> struct __nv_tex_rmet_ret<ulong2> { typedef ulong2 type; };
+template<> struct __nv_tex_rmet_ret<long4> { typedef long4 type; };
+template<> struct __nv_tex_rmet_ret<ulong4> { typedef ulong4 type; };
+#endif /* !__LP64__ */
+template<> struct __nv_tex_rmet_ret<float> { typedef float type; };
+template<> struct __nv_tex_rmet_ret<float1> { typedef float1 type; };
+template<> struct __nv_tex_rmet_ret<float2> { typedef float2 type; };
+template<> struct __nv_tex_rmet_ret<float4> { typedef float4 type; };
+
+
+template <typename T> struct __nv_tex_rmet_cast { typedef T* type;  };
+#if !defined(__LP64__)
+template<> struct __nv_tex_rmet_cast<long> { typedef int *type; };
+template<> struct __nv_tex_rmet_cast<unsigned long> { typedef unsigned int *type; };
+template<> struct __nv_tex_rmet_cast<long1> { typedef int1 *type; };
+template<> struct __nv_tex_rmet_cast<ulong1> { typedef uint1 *type; };
+template<> struct __nv_tex_rmet_cast<long2> { typedef int2 *type; };
+template<> struct __nv_tex_rmet_cast<ulong2> { typedef uint2 *type; };
+template<> struct __nv_tex_rmet_cast<long4> { typedef int4 *type; };
+template<> struct __nv_tex_rmet_cast<ulong4> { typedef uint4 *type; };
+#endif /* !__LP64__ */
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__  typename __nv_tex_rmet_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeElementType> t, int x)
+{
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1Dfetch_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x);
+  return temp;
+#endif
+}
+
+template <typename T>
+struct __nv_tex_rmnf_ret { };
+
+template <> struct __nv_tex_rmnf_ret<char> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<signed char> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<unsigned char> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<short> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<unsigned short> { typedef float type; };
+template <> struct __nv_tex_rmnf_ret<char1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<uchar1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<short1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<ushort1> { typedef float1 type; };
+template <> struct __nv_tex_rmnf_ret<char2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<uchar2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<short2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<ushort2> { typedef float2 type; };
+template <> struct __nv_tex_rmnf_ret<char4> { typedef float4 type; };
+template <> struct __nv_tex_rmnf_ret<uchar4> { typedef float4 type; };
+template <> struct __nv_tex_rmnf_ret<short4> { typedef float4 type; };
+template <> struct __nv_tex_rmnf_ret<ushort4> { typedef float4 type; };
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1Dfetch(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, int x) 
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1Dfetch_rmnf_v2", &type_dummy, &retval, t, x);
+  return retval;
+#endif /* __CUDA_ARCH__ */  
+}
+
+// tex1D
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1D(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1D_rmnf_v2", &type_dummy, &retval, t, x);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+//tex2D
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+
+  __nv_tex_surf_handler("__tex2D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2D(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2D_rmnf_v2", &type_dummy, &retval, t, x, y);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+//tex1DLayered
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, layer);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayered(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1DLayered_rmnf_v2", &type_dummy, &retval, t, x, layer);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+//tex2DLayered
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex2DLayered_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, layer);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayered(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2DLayered_rmnf_v2", &type_dummy, &retval, t, x, y, layer);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex3D
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex3D_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3D(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex3D_rmnf_v2", &type_dummy, &retval, t, x, y, z);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// texCubemap
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemap_v2", (typename __nv_tex_rmet_cast<T>::type) &temp, t, x, y, z);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemap(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemap_rmnf_v2", &type_dummy, &retval, t, x, y, z);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+template <typename T>
+struct __nv_tex2dgather_ret { };
+template <> struct __nv_tex2dgather_ret<char> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<signed char> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char1> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char2> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char3> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<char4> { typedef char4 type; };
+template <> struct __nv_tex2dgather_ret<unsigned char> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar1> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar2> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar3> { typedef uchar4 type; };
+template <> struct __nv_tex2dgather_ret<uchar4> { typedef uchar4 type; };
+
+template <> struct __nv_tex2dgather_ret<short> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short1> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short2> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short3> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<short4> { typedef short4 type; };
+template <> struct __nv_tex2dgather_ret<unsigned short> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort1> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort2> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort3> { typedef ushort4 type; };
+template <> struct __nv_tex2dgather_ret<ushort4> { typedef ushort4 type; };
+
+template <> struct __nv_tex2dgather_ret<int> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int1> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int2> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int3> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<int4> { typedef int4 type; };
+template <> struct __nv_tex2dgather_ret<unsigned int> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint1> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint2> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint3> { typedef uint4 type; };
+template <> struct __nv_tex2dgather_ret<uint4> { typedef uint4 type; };
+
+template <> struct __nv_tex2dgather_ret<float> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float1> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float2> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float3> { typedef float4 type; };
+template <> struct __nv_tex2dgather_ret<float4> { typedef float4 type; };
+
+template <typename T>
+static __device__ __forceinline__ typename __nv_tex2dgather_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, int comp=0)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex2dgather_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2Dgather_v2", &type_dummy, &retval, t, x, y, comp);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+template<typename T> struct __nv_tex2dgather_rmnf_ret { };
+template<> struct __nv_tex2dgather_rmnf_ret<char> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<signed char> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<unsigned char> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<char4> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<uchar4> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<signed short> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<unsigned short> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort1> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort2> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort3> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<short4> { typedef float4 type; };
+template<> struct __nv_tex2dgather_rmnf_ret<ushort4> { typedef float4 type; };
+
+template <typename T>
+static __device__ __forceinline__  typename __nv_tex2dgather_rmnf_ret<T>::type tex2Dgather(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, int comp = 0)
+{  
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex2dgather_rmnf_ret<T>::type  retval;
+  __nv_tex_surf_handler("__tex2Dgather_rmnf_v2", &type_dummy, &retval, t, x, y, comp);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// tex1DLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLod(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1DLod_rmnf_v2", &type_dummy, &retval, t, x, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex2DLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex2DLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLod(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2DLod_rmnf_v2", &type_dummy, &retval, t, x, y, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex1DLayeredLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredLod(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, layer, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex2DLayeredLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex2DLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredLod(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2DLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, layer, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex3DLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex3DLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DLod(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex3DLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// texCubemapLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemapLod_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLod(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemapLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// texCubemapLayered
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer)
+{
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemapLayered_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayered(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer)
+{
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemapLayered_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer);
+  return retval;
+#endif /* __CUDA_ARCH__ */
+}
+
+
+// texCubemapLayeredLod
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemapLayeredLod_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, level);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredLod(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float level)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemapLayeredLod_rmnf_v2", &type_dummy, &retval, t, x, y, z, layer, level);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// texCubemapGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemapGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapGrad(texture<T, cudaTextureTypeCubemap, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemapGrad_rmnf_v2", &type_dummy, &retval, t,  x, y, z, &dPdx, &dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// texCubemapLayeredGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeElementType> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__texCubemapLayeredGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, layer, &dPdx, &dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type texCubemapLayeredGrad(texture<T, cudaTextureTypeCubemapLayered, cudaReadModeNormalizedFloat> t, float x, float y, float z, int layer, float4 dPdx, float4 dPdy) 
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__texCubemapLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, layer, &dPdx, &dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// tex1DGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeElementType> t, float x, float dPdx, float dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, dPdx, dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DGrad(texture<T, cudaTextureType1D, cudaReadModeNormalizedFloat> t, float x, float dPdx, float dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1DGrad_rmnf_v2", &type_dummy, &retval,t, x,dPdx, dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+
+// tex2DGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeElementType> t, float x, float y, float2 dPdx, float2 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex2DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, &dPdx, &dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DGrad(texture<T, cudaTextureType2D, cudaReadModeNormalizedFloat> t, float x, float y, float2 dPdx, float2 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, &dPdx, &dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex1DLayeredGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeElementType> t, float x, int layer, float dPdx, float dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex1DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, layer, dPdx, dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex1DLayeredGrad(texture<T, cudaTextureType1DLayered, cudaReadModeNormalizedFloat> t, float x, int layer, float dPdx, float dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex1DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, layer, dPdx, dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex2DLayeredGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeElementType> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex2DLayeredGrad_v2",(typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, layer, &dPdx, &dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex2DLayeredGrad(texture<T, cudaTextureType2DLayered, cudaReadModeNormalizedFloat> t, float x, float y, int layer, float2 dPdx, float2 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex2DLayeredGrad_rmnf_v2", &type_dummy, &retval,t, x, y, layer, &dPdx, &dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+// tex3DGrad
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmet_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeElementType> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  typename __nv_tex_rmet_ret<T>::type temp;
+  __nv_tex_surf_handler("__tex3DGrad_v2", (typename __nv_tex_rmet_cast<T>::type)&temp, t, x, y, z, &dPdx, &dPdy);
+  return temp;
+#endif
+}
+
+template <typename T>
+static __DEPRECATED__ __forceinline__ __device__ typename __nv_tex_rmnf_ret<T>::type tex3DGrad(texture<T, cudaTextureType3D, cudaReadModeNormalizedFloat> t, float x, float y, float z, float4 dPdx, float4 dPdy)
+{ 
+#ifdef __CUDA_ARCH__
+  T type_dummy;
+  typename __nv_tex_rmnf_ret<T>::type retval;
+  __nv_tex_surf_handler("__tex3DGrad_rmnf_v2", &type_dummy, &retval,t, x, y, z, &dPdx, &dPdy);
+  return retval;
+#endif /* __CUDA_ARCH__ */ 
+}
+
+#undef __DEPRECATED__
+
+#endif /* __cplusplus && __CUDACC__ */
+
+#endif /* !__TEXTURE_FETCH_FUNCTIONS_H__ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0fceb1208fe38192addd97bed1f2c19b0eebd61
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d3e341c8ac1735ad16deb34995ac4c0902da053
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright 2017-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*   cudnn : Neural Networks Library
+
+*/
+
+#if !defined(CUDNN_H_)
+#define CUDNN_H_
+
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+#include "cudnn_version.h"
+#include "cudnn_ops_infer.h"
+#include "cudnn_ops_train.h"
+#include "cudnn_adv_infer.h"
+#include "cudnn_adv_train.h"
+#include "cudnn_cnn_infer.h"
+#include "cudnn_cnn_train.h"
+
+#include "cudnn_backend.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h
new file mode 100644
index 0000000000000000000000000000000000000000..1aa47bbc71d664de3af742f1c5223b149ee5d3f3
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_infer.h
@@ -0,0 +1,658 @@
+/*
+ * Copyright 2017-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*   cudnn_adv_infer : cuDNN's advanced and experimental features.
+
+*/
+
+#if !defined(CUDNN_ADV_INFER_H_)
+#define CUDNN_ADV_INFER_H_
+
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+#include "cudnn_version.h"
+#include "cudnn_ops_infer.h"
+
+/* These version numbers are autogenerated, do not edit manually. */
+#define CUDNN_ADV_INFER_MAJOR 8
+#define CUDNN_ADV_INFER_MINOR 7
+#define CUDNN_ADV_INFER_PATCH 0
+
+#if (CUDNN_ADV_INFER_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_INFER_MINOR != CUDNN_MINOR) || \
+    (CUDNN_ADV_INFER_PATCH != CUDNN_PATCHLEVEL)
+#error Version mismatch in cuDNN ADV INFER!!!
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* BASIC RNN API */
+
+typedef enum {
+    CUDNN_FWD_MODE_INFERENCE = 0,
+    CUDNN_FWD_MODE_TRAINING  = 1,
+} cudnnForwardMode_t;
+
+typedef enum {
+    CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */
+    CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
+    CUDNN_LSTM     = 2, /* LSTM with optional recurrent projection and clipping */
+    CUDNN_GRU      = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
+} cudnnRNNMode_t;
+
+typedef enum {
+    CUDNN_RNN_NO_BIAS         = 0, /* rnn cell formulas do not use biases */
+    CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
+    CUDNN_RNN_DOUBLE_BIAS     = 2, /* default, rnn cell formulas use two bias vectors */
+    CUDNN_RNN_SINGLE_REC_BIAS = 3  /* rnn cell formulas use one recurrent bias in recurrent GEMM */
+} cudnnRNNBiasMode_t;
+
+typedef enum {
+    CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
+    CUDNN_BIDIRECTIONAL  = 1, /* output concatination at each layer */
+} cudnnDirectionMode_t;
+
+typedef enum {
+    CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
+    CUDNN_SKIP_INPUT   = 1, /* fixed identity matrix in the first layer input GEMM */
+} cudnnRNNInputMode_t;
+
+typedef enum {
+    CUDNN_RNN_CLIP_NONE   = 0, /* disables LSTM cell clipping */
+    CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
+} cudnnRNNClipMode_t;
+
+typedef enum {
+    CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED   = 0, /* padded, outer stride from one time-step to the next */
+    CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED     = 1, /* sequence length sorted and packed as in basic RNN api */
+    CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
+} cudnnRNNDataLayout_t;
+
+/* Legacy type for backward compatibility */
+typedef unsigned cudnnRNNPaddingMode_t;
+
+/* For auxFlags in cudnnSetRNNDescriptor_v8() and cudnnSetRNNPaddingMode() */
+#define CUDNN_RNN_PADDED_IO_DISABLED 0
+#define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
+
+struct cudnnRNNStruct;
+typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
+
+struct cudnnPersistentRNNPlan;
+typedef struct cudnnPersistentRNNPlan *cudnnPersistentRNNPlan_t;
+
+struct cudnnRNNDataStruct;
+typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
+                         cudnnRNNAlgo_t algo,
+                         cudnnRNNMode_t cellMode,
+                         cudnnRNNBiasMode_t biasMode,
+                         cudnnDirectionMode_t dirMode,
+                         cudnnRNNInputMode_t inputMode,
+                         cudnnDataType_t dataType,
+                         cudnnDataType_t mathPrec,
+                         cudnnMathType_t mathType,
+                         int32_t inputSize,
+                         int32_t hiddenSize,
+                         int32_t projSize,
+                         int32_t numLayers,
+                         cudnnDropoutDescriptor_t dropoutDesc,
+                         uint32_t auxFlags);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
+                         cudnnRNNAlgo_t *algo,
+                         cudnnRNNMode_t *cellMode,
+                         cudnnRNNBiasMode_t *biasMode,
+                         cudnnDirectionMode_t *dirMode,
+                         cudnnRNNInputMode_t *inputMode,
+                         cudnnDataType_t *dataType,
+                         cudnnDataType_t *mathPrec,
+                         cudnnMathType_t *mathType,
+                         int32_t *inputSize,
+                         int32_t *hiddenSize,
+                         int32_t *projSize,
+                         int32_t *numLayers,
+                         cudnnDropoutDescriptor_t *dropoutDesc,
+                         uint32_t *auxFlags);
+
+/*
+ * mathPrec in cudnnSetRNNDescriptor_v6() specifies compute precision
+ * compute precision is further modified by cudnnSetRNNMatrixMathType()
+ * dataType in cudnnGetRNNParamsSize() and wDesc specify weight storage
+ * dropout is between RNN layers, not between recurrent steps
+ */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDescriptor_v6(cudnnHandle_t handle,
+                         cudnnRNNDescriptor_t rnnDesc,
+                         const int hiddenSize,
+                         const int numLayers,
+                         cudnnDropoutDescriptor_t dropoutDesc,
+                         cudnnRNNInputMode_t inputMode,
+                         cudnnDirectionMode_t direction,
+                         cudnnRNNMode_t cellMode,
+                         cudnnRNNAlgo_t algo,
+                         cudnnDataType_t mathPrec);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNDescriptor_v6(cudnnHandle_t handle,
+                         cudnnRNNDescriptor_t rnnDesc,
+                         int *hiddenSize,
+                         int *numLayers,
+                         cudnnDropoutDescriptor_t *dropoutDesc,
+                         cudnnRNNInputMode_t *inputMode,
+                         cudnnDirectionMode_t *direction,
+                         cudnnRNNMode_t *cellMode,
+                         cudnnRNNAlgo_t *algo,
+                         cudnnDataType_t *mathPrec);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t mType);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNMatrixMathType(cudnnRNNDescriptor_t rnnDesc, cudnnMathType_t *mType);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t biasMode);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNBiasMode(cudnnRNNDescriptor_t rnnDesc, cudnnRNNBiasMode_t *biasMode);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
+                   cudnnRNNClipMode_t clipMode,
+                   cudnnNanPropagation_t clipNanOpt,
+                   double lclip,
+                   double rclip);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
+                   cudnnRNNClipMode_t *clipMode,
+                   cudnnNanPropagation_t *clipNanOpt,
+                   double *lclip,
+                   double *rclip);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNSetClip(cudnnHandle_t handle,
+                cudnnRNNDescriptor_t rnnDesc,
+                cudnnRNNClipMode_t clipMode,
+                cudnnNanPropagation_t clipNanOpt,
+                double lclip,
+                double rclip);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNGetClip(cudnnHandle_t handle,
+                cudnnRNNDescriptor_t rnnDesc,
+                cudnnRNNClipMode_t *clipMode,
+                cudnnNanPropagation_t *clipNanOpt,
+                double *lclip,
+                double *rclip);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNProjectionLayers(cudnnHandle_t handle,
+                            cudnnRNNDescriptor_t rnnDesc,
+                            const int recProjSize,
+                            const int outProjSize);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNProjectionLayers(cudnnHandle_t handle,
+                            const cudnnRNNDescriptor_t rnnDesc,
+                            int *recProjSize,
+                            int *outProjSize);
+
+/* Expensive. Creates the plan for the specific settings. */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreatePersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc,
+                             const int minibatch,
+                             const cudnnDataType_t dataType,
+                             cudnnPersistentRNNPlan_t *plan);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyPersistentRNNPlan(cudnnPersistentRNNPlan_t plan);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetPersistentRNNPlan(cudnnRNNDescriptor_t rnnDesc, cudnnPersistentRNNPlan_t plan);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
+
+/* dataType in weight descriptors and input descriptors is used to describe storage */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNWorkspaceSize(cudnnHandle_t handle,
+                         const cudnnRNNDescriptor_t rnnDesc,
+                         const int seqLength,
+                         const cudnnTensorDescriptor_t *xDesc,
+                         size_t *sizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNTrainingReserveSize(cudnnHandle_t handle,
+                               const cudnnRNNDescriptor_t rnnDesc,
+                               const int seqLength,
+                               const cudnnTensorDescriptor_t *xDesc,
+                               size_t *sizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
+                          cudnnRNNDescriptor_t rnnDesc,
+                          cudnnForwardMode_t fMode,
+                          cudnnRNNDataDescriptor_t xDesc,
+                          size_t *workSpaceSize,
+                          size_t *reserveSpaceSize);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNParamsSize(cudnnHandle_t handle,
+                      const cudnnRNNDescriptor_t rnnDesc,
+                      const cudnnTensorDescriptor_t xDesc,
+                      size_t *sizeInBytes,
+                      cudnnDataType_t dataType);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNLinLayerMatrixParams(cudnnHandle_t handle,
+                                const cudnnRNNDescriptor_t rnnDesc,
+                                const int pseudoLayer,
+                                const cudnnTensorDescriptor_t xDesc,
+                                const cudnnFilterDescriptor_t wDesc,
+                                const void *w,
+                                const int linLayerID,
+                                cudnnFilterDescriptor_t linLayerMatDesc,
+                                void **linLayerMat);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNLinLayerBiasParams(cudnnHandle_t handle,
+                              const cudnnRNNDescriptor_t rnnDesc,
+                              const int pseudoLayer,
+                              const cudnnTensorDescriptor_t xDesc,
+                              const cudnnFilterDescriptor_t wDesc,
+                              const void *w,
+                              const int linLayerID,
+                              cudnnFilterDescriptor_t linLayerBiasDesc,
+                              void **linLayerBias);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNWeightParams(cudnnHandle_t handle,
+                        cudnnRNNDescriptor_t rnnDesc,
+                        int32_t pseudoLayer,
+                        size_t weightSpaceSize,
+                        const void *weightSpace,
+                        int32_t linLayerID,
+                        cudnnTensorDescriptor_t mDesc,
+                        void **mAddr,
+                        cudnnTensorDescriptor_t bDesc,
+                        void **bAddr);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardInference(cudnnHandle_t handle,
+                         const cudnnRNNDescriptor_t rnnDesc,
+                         const int seqLength,
+                         const cudnnTensorDescriptor_t *xDesc,
+                         const void *x,
+                         const cudnnTensorDescriptor_t hxDesc,
+                         const void *hx,
+                         const cudnnTensorDescriptor_t cxDesc,
+                         const void *cx,
+                         const cudnnFilterDescriptor_t wDesc,
+                         const void *w,
+                         const cudnnTensorDescriptor_t *yDesc,
+                         void *y,
+                         const cudnnTensorDescriptor_t hyDesc,
+                         void *hy,
+                         const cudnnTensorDescriptor_t cyDesc,
+                         void *cy,
+                         void *workSpace,
+                         size_t workSpaceSizeInBytes);
+
+/* RNN EX API */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned paddingMode);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNPaddingMode(cudnnRNNDescriptor_t rnnDesc, unsigned *paddingMode);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
+                          cudnnDataType_t dataType,
+                          cudnnRNNDataLayout_t layout,
+                          int maxSeqLength,
+                          int batchSize,
+                          int vectorSize,
+                          const int seqLengthArray[], /* length of each sequence in the batch */
+                          void *paddingFill);         /* symbol for filling padding position in output */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
+                          cudnnDataType_t *dataType,
+                          cudnnRNNDataLayout_t *layout,
+                          int *maxSeqLength,
+                          int *batchSize,
+                          int *vectorSize,
+                          int arrayLengthRequested,
+                          int seqLengthArray[],
+                          void *paddingFill);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNForwardInferenceEx(cudnnHandle_t handle,
+                           const cudnnRNNDescriptor_t rnnDesc,
+                           const cudnnRNNDataDescriptor_t xDesc,
+                           const void *x,
+                           const cudnnTensorDescriptor_t hxDesc,
+                           const void *hx,
+                           const cudnnTensorDescriptor_t cxDesc,
+                           const void *cx,
+                           const cudnnFilterDescriptor_t wDesc,
+                           const void *w,
+                           const cudnnRNNDataDescriptor_t yDesc,
+                           void *y,
+                           const cudnnTensorDescriptor_t hyDesc,
+                           void *hy,
+                           const cudnnTensorDescriptor_t cyDesc,
+                           void *cy,
+                           const cudnnRNNDataDescriptor_t kDesc, /* reserved, should pass NULL */
+                           const void *keys,                     /* reserved, should pass NULL */
+                           const cudnnRNNDataDescriptor_t cDesc, /* reserved, should pass NULL */
+                           void *cAttn,                          /* reserved, should pass NULL */
+                           const cudnnRNNDataDescriptor_t iDesc, /* reserved, should pass NULL */
+                           void *iAttn,                          /* reserved, should pass NULL */
+                           const cudnnRNNDataDescriptor_t qDesc, /* reserved, should pass NULL */
+                           void *queries,                        /* reserved, should pass NULL */
+                           void *workSpace,
+                           size_t workSpaceSizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNForward(cudnnHandle_t handle,
+                cudnnRNNDescriptor_t rnnDesc,
+                cudnnForwardMode_t fwdMode,
+                const int32_t devSeqLengths[],
+                cudnnRNNDataDescriptor_t xDesc,
+                const void *x,
+                cudnnRNNDataDescriptor_t yDesc,
+                void *y,
+                cudnnTensorDescriptor_t hDesc,
+                const void *hx,
+                void *hy,
+                cudnnTensorDescriptor_t cDesc,
+                const void *cx,
+                void *cy,
+                size_t weightSpaceSize,
+                const void *weightSpace,
+                size_t workSpaceSize,
+                void *workSpace,
+                size_t reserveSpaceSize,
+                void *reserveSpace);
+
+/* RNN FIND API */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNAlgorithmDescriptor(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, cudnnAlgorithmDescriptor_t algoDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNForwardInferenceAlgorithmMaxCount(cudnnHandle_t handle, const cudnnRNNDescriptor_t rnnDesc, int *count);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnFindRNNForwardInferenceAlgorithmEx(cudnnHandle_t handle,
+                                        const cudnnRNNDescriptor_t rnnDesc,
+                                        const int seqLength,
+                                        const cudnnTensorDescriptor_t *xDesc,
+                                        const void *x,
+                                        const cudnnTensorDescriptor_t hxDesc,
+                                        const void *hx,
+                                        const cudnnTensorDescriptor_t cxDesc,
+                                        const void *cx,
+                                        const cudnnFilterDescriptor_t wDesc,
+                                        const void *w,
+                                        const cudnnTensorDescriptor_t *yDesc,
+                                        void *y,
+                                        const cudnnTensorDescriptor_t hyDesc,
+                                        void *hy,
+                                        const cudnnTensorDescriptor_t cyDesc,
+                                        void *cy,
+                                        const float findIntensity,
+                                        const int requestedAlgoCount,
+                                        int *returnedAlgoCount,
+                                        cudnnAlgorithmPerformance_t *perfResults,
+                                        void *workspace,
+                                        size_t workSpaceSizeInBytes);
+
+/* Sequence data descriptor */
+
+typedef enum {
+    CUDNN_SEQDATA_TIME_DIM  = 0, /* index in time */
+    CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
+    CUDNN_SEQDATA_BEAM_DIM  = 2, /* index in beam */
+    CUDNN_SEQDATA_VECT_DIM  = 3  /* index in vector */
+} cudnnSeqDataAxis_t;
+
+struct cudnnSeqDataStruct;
+typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t;
+
+#define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
+                          cudnnDataType_t dataType,
+                          int nbDims,
+                          const int dimA[],
+                          const cudnnSeqDataAxis_t axes[],
+                          size_t seqLengthArraySize,
+                          const int seqLengthArray[],
+                          void *paddingFill);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
+                          cudnnDataType_t *dataType,
+                          int *nbDims,
+                          int nbDimsRequested,
+                          int dimA[],
+                          cudnnSeqDataAxis_t axes[],
+                          size_t *seqLengthArraySize,
+                          size_t seqLengthSizeRequested,
+                          int seqLengthArray[],
+                          void *paddingFill);
+
+/* Multihead Attention */
+
+/* Legacy type for backward compatibility */
+typedef unsigned cudnnAttnQueryMap_t;
+
+/*
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
+ * Use the bitwise OR operator to combine several settings listed below.  Additional
+ * minor options can be added here w/o changing or introducing new API functions.
+ */
+#define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0         /* multiple Q-s map to a single (K,V) set when beam size > 1 */
+#define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
+#define CUDNN_ATTN_DISABLE_PROJ_BIASES 0         /* no biases in attention input and output projections */
+#define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1)  /* use biases in attention input and output projections */
+
+struct cudnnAttnStruct;
+typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
+                       unsigned attnMode,
+                       int nHeads,
+                       double smScaler,
+                       cudnnDataType_t dataType,
+                       cudnnDataType_t computePrec,
+                       cudnnMathType_t mathType,
+                       cudnnDropoutDescriptor_t attnDropoutDesc,
+                       cudnnDropoutDescriptor_t postDropoutDesc,
+                       int qSize,
+                       int kSize,
+                       int vSize,
+                       int qProjSize,
+                       int kProjSize,
+                       int vProjSize,
+                       int oProjSize,
+                       int qoMaxSeqLength,
+                       int kvMaxSeqLength,
+                       int maxBatchSize,
+                       int maxBeamSize);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
+                       unsigned *attnMode,
+                       int *nHeads,
+                       double *smScaler,
+                       cudnnDataType_t *dataType,
+                       cudnnDataType_t *computePrec,
+                       cudnnMathType_t *mathType,
+                       cudnnDropoutDescriptor_t *attnDropoutDesc,
+                       cudnnDropoutDescriptor_t *postDropoutDesc,
+                       int *qSize,
+                       int *kSize,
+                       int *vSize,
+                       int *qProjSize,
+                       int *kProjSize,
+                       int *vProjSize,
+                       int *oProjSize,
+                       int *qoMaxSeqLength,
+                       int *kvMaxSeqLength,
+                       int *maxBatchSize,
+                       int *maxBeamSize);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
+                             const cudnnAttnDescriptor_t attnDesc,
+                             size_t *weightSizeInBytes,
+                             size_t *workSpaceSizeInBytes,
+                             size_t *reserveSpaceSizeInBytes);
+
+typedef enum {
+    CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
+    CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
+    CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
+    CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
+    CUDNN_MH_ATTN_Q_BIASES  = 4, /* input projection bias tensor for 'queries' */
+    CUDNN_MH_ATTN_K_BIASES  = 5, /* input projection bias for 'keys' */
+    CUDNN_MH_ATTN_V_BIASES  = 6, /* input projection bias for 'values' */
+    CUDNN_MH_ATTN_O_BIASES  = 7, /* output projection biases */
+} cudnnMultiHeadAttnWeightKind_t;
+
+#define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
+                             const cudnnAttnDescriptor_t attnDesc,
+                             cudnnMultiHeadAttnWeightKind_t wKind,
+                             size_t weightSizeInBytes,
+                             const void *weights,
+                             cudnnTensorDescriptor_t wDesc,
+                             void **wAddr);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnMultiHeadAttnForward(cudnnHandle_t handle,
+                          const cudnnAttnDescriptor_t attnDesc,
+                          int currIdx,
+                          const int loWinIdx[],
+                          const int hiWinIdx[],
+                          const int devSeqLengthsQO[],
+                          const int devSeqLengthsKV[],
+                          const cudnnSeqDataDescriptor_t qDesc,
+                          const void *queries,
+                          const void *residuals,
+                          const cudnnSeqDataDescriptor_t kDesc,
+                          const void *keys,
+                          const cudnnSeqDataDescriptor_t vDesc,
+                          const void *values,
+                          const cudnnSeqDataDescriptor_t oDesc,
+                          void *out,
+                          size_t weightSizeInBytes,
+                          const void *weights,
+                          size_t workSpaceSizeInBytes,
+                          void *workSpace,
+                          size_t reserveSpaceSizeInBytes,
+                          void *reserveSpace);
+
+/*
+ * \brief Cross-library version checker.
+ * This function is implemented differently in each sub-library. Each sublib
+ * checks whether its own version matches that of its dependencies.
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
+ *          CUDNN_STATUS_VERSION_MISMATCH if the versions are inconsistent.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnAdvInferVersionCheck(void);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_ADV_INFER_H_ */
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b0197794116f93c1304639cf1de950e1db686e4
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h
@@ -0,0 +1,6840 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(CUSPARSE_H_)
+#define CUSPARSE_H_
+
+#include <cuComplex.h>
+#include <cuda_fp16.h>
+#include <driver_types.h>
+#include <library_types.h>
+#include <stdint.h>
+#include <stdio.h>
+
+//##############################################################################
+//# CUSPARSE VERSION INFORMATION
+//##############################################################################
+
+#define CUSPARSE_VER_MAJOR 11
+#define CUSPARSE_VER_MINOR 7
+#define CUSPARSE_VER_PATCH 5
+#define CUSPARSE_VER_BUILD 86
+#define CUSPARSE_VERSION (CUSPARSE_VER_MAJOR * 1000 + \
+                          CUSPARSE_VER_MINOR *  100 + \
+                          CUSPARSE_VER_PATCH)
+
+// #############################################################################
+// # BASIC MACROS
+// #############################################################################
+
+#if !defined(CUSPARSEAPI)
+#    if defined(_WIN32)
+#        define CUSPARSEAPI __stdcall
+#    else
+#        define CUSPARSEAPI
+#    endif
+#endif
+
+//------------------------------------------------------------------------------
+
+#if !defined(_MSC_VER)
+#   define CUSPARSE_CPP_VERSION __cplusplus
+#elif _MSC_FULL_VER >= 190024210 // Visual Studio 2015 Update 3
+#   define CUSPARSE_CPP_VERSION _MSVC_LANG
+#else
+#   define CUSPARSE_CPP_VERSION 0
+#endif
+
+// #############################################################################
+// # CUSPARSE_DEPRECATED MACRO
+// #############################################################################
+
+#if !defined(DISABLE_CUSPARSE_DEPRECATED)
+
+#   if CUSPARSE_CPP_VERSION >= 201402L
+
+#       define CUSPARSE_DEPRECATED(new_func)                                   \
+            [[deprecated("please use " #new_func " instead")]]
+
+#   elif defined(_MSC_VER)
+
+#       define CUSPARSE_DEPRECATED(new_func)                                   \
+            __declspec(deprecated("please use " #new_func " instead"))
+
+#   elif defined(__INTEL_COMPILER) || defined(__clang__) ||                    \
+         (defined(__GNUC__) &&                                                 \
+          (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
+
+#       define CUSPARSE_DEPRECATED(new_func)                                   \
+            __attribute__((deprecated("please use " #new_func " instead")))
+
+#   elif defined(__GNUC__) || defined(__xlc__)
+
+#       define CUSPARSE_DEPRECATED(new_func)                                   \
+            __attribute__((deprecated))
+
+#   else
+
+#       define CUSPARSE_DEPRECATED(new_func)
+
+#   endif // defined(__cplusplus) && __cplusplus >= 201402L
+//------------------------------------------------------------------------------
+
+#   if CUSPARSE_CPP_VERSION >= 201703L
+
+#       define CUSPARSE_DEPRECATED_ENUM(new_enum)                              \
+            [[deprecated("please use " #new_enum " instead")]]
+
+#   elif defined(__clang__) ||                                                 \
+         (defined(__GNUC__) && __GNUC__ >= 6 && !defined(__PGI))
+
+#       define CUSPARSE_DEPRECATED_ENUM(new_enum)                              \
+            __attribute__((deprecated("please use " #new_enum " instead")))
+
+#   else
+
+#       define CUSPARSE_DEPRECATED_ENUM(new_enum)
+
+#   endif // defined(__cplusplus) && __cplusplus >= 201402L
+
+#else // defined(DISABLE_CUSPARSE_DEPRECATED)
+
+#   define CUSPARSE_DEPRECATED(new_func)
+#   define CUSPARSE_DEPRECATED_ENUM(new_enum)
+
+#endif // !defined(DISABLE_CUSPARSE_DEPRECATED)
+
+#undef CUSPARSE_CPP_VERSION
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus)
+extern "C" {
+#endif // defined(__cplusplus)
+
+//##############################################################################
+//# OPAQUE DATA STRUCTURES
+//##############################################################################
+
+struct cusparseContext;
+typedef struct cusparseContext* cusparseHandle_t;
+
+struct cusparseMatDescr;
+typedef struct cusparseMatDescr* cusparseMatDescr_t;
+
+struct csrsv2Info;
+typedef struct csrsv2Info* csrsv2Info_t;
+
+struct csrsm2Info;
+typedef struct csrsm2Info* csrsm2Info_t;
+
+struct bsrsv2Info;
+typedef struct bsrsv2Info* bsrsv2Info_t;
+
+struct bsrsm2Info;
+typedef struct bsrsm2Info* bsrsm2Info_t;
+
+struct csric02Info;
+typedef struct csric02Info* csric02Info_t;
+
+struct bsric02Info;
+typedef struct bsric02Info* bsric02Info_t;
+
+struct csrilu02Info;
+typedef struct csrilu02Info* csrilu02Info_t;
+
+struct bsrilu02Info;
+typedef struct bsrilu02Info* bsrilu02Info_t;
+
+struct csrgemm2Info;
+typedef struct csrgemm2Info* csrgemm2Info_t;
+
+struct csru2csrInfo;
+typedef struct csru2csrInfo* csru2csrInfo_t;
+
+struct cusparseColorInfo;
+typedef struct cusparseColorInfo* cusparseColorInfo_t;
+
+struct pruneInfo;
+typedef struct pruneInfo* pruneInfo_t;
+
+//##############################################################################
+//# ENUMERATORS
+//##############################################################################
+
+typedef enum {
+    CUSPARSE_STATUS_SUCCESS                   = 0,
+    CUSPARSE_STATUS_NOT_INITIALIZED           = 1,
+    CUSPARSE_STATUS_ALLOC_FAILED              = 2,
+    CUSPARSE_STATUS_INVALID_VALUE             = 3,
+    CUSPARSE_STATUS_ARCH_MISMATCH             = 4,
+    CUSPARSE_STATUS_MAPPING_ERROR             = 5,
+    CUSPARSE_STATUS_EXECUTION_FAILED          = 6,
+    CUSPARSE_STATUS_INTERNAL_ERROR            = 7,
+    CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8,
+    CUSPARSE_STATUS_ZERO_PIVOT                = 9,
+    CUSPARSE_STATUS_NOT_SUPPORTED             = 10,
+    CUSPARSE_STATUS_INSUFFICIENT_RESOURCES    = 11
+} cusparseStatus_t;
+
+typedef enum {
+    CUSPARSE_POINTER_MODE_HOST   = 0,
+    CUSPARSE_POINTER_MODE_DEVICE = 1
+} cusparsePointerMode_t;
+
+typedef enum {
+    CUSPARSE_ACTION_SYMBOLIC = 0,
+    CUSPARSE_ACTION_NUMERIC  = 1
+} cusparseAction_t;
+
+typedef enum {
+    CUSPARSE_MATRIX_TYPE_GENERAL    = 0,
+    CUSPARSE_MATRIX_TYPE_SYMMETRIC  = 1,
+    CUSPARSE_MATRIX_TYPE_HERMITIAN  = 2,
+    CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3
+} cusparseMatrixType_t;
+
+typedef enum {
+    CUSPARSE_FILL_MODE_LOWER = 0,
+    CUSPARSE_FILL_MODE_UPPER = 1
+} cusparseFillMode_t;
+
+typedef enum {
+    CUSPARSE_DIAG_TYPE_NON_UNIT = 0,
+    CUSPARSE_DIAG_TYPE_UNIT     = 1
+} cusparseDiagType_t;
+
+typedef enum {
+    CUSPARSE_INDEX_BASE_ZERO = 0,
+    CUSPARSE_INDEX_BASE_ONE  = 1
+} cusparseIndexBase_t;
+
+typedef enum {
+    CUSPARSE_OPERATION_NON_TRANSPOSE       = 0,
+    CUSPARSE_OPERATION_TRANSPOSE           = 1,
+    CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
+} cusparseOperation_t;
+
+typedef enum {
+    CUSPARSE_DIRECTION_ROW    = 0,
+    CUSPARSE_DIRECTION_COLUMN = 1
+} cusparseDirection_t;
+
+typedef enum {
+    CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0,
+    CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1
+} cusparseSolvePolicy_t;
+
+typedef enum {
+    CUSPARSE_COLOR_ALG0 = 0, // default
+    CUSPARSE_COLOR_ALG1 = 1
+} cusparseColorAlg_t;
+
+typedef enum {
+    CUSPARSE_ALG_MERGE_PATH // merge path alias
+} cusparseAlgMode_t;
+
+//##############################################################################
+//# INITIALIZATION AND MANAGEMENT ROUTINES
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreate(cusparseHandle_t* handle);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroy(cusparseHandle_t handle);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGetVersion(cusparseHandle_t handle,
+                   int*             version);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGetProperty(libraryPropertyType type,
+                    int*                value);
+
+const char* CUSPARSEAPI
+cusparseGetErrorName(cusparseStatus_t status);
+
+const char* CUSPARSEAPI
+cusparseGetErrorString(cusparseStatus_t status);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetStream(cusparseHandle_t handle,
+                  cudaStream_t     streamId);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGetStream(cusparseHandle_t handle,
+                  cudaStream_t*    streamId);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGetPointerMode(cusparseHandle_t       handle,
+                       cusparsePointerMode_t* mode);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetPointerMode(cusparseHandle_t      handle,
+                       cusparsePointerMode_t mode);
+
+//##############################################################################
+//# LOGGING APIs
+//##############################################################################
+
+typedef void (*cusparseLoggerCallback_t)(int         logLevel,
+                                         const char* functionName,
+                                         const char* message);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerSetCallback(cusparseLoggerCallback_t callback);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerSetFile(FILE* file);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerOpenFile(const char* logFile);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerSetLevel(int level);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerSetMask(int mask);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerForceDisable(void);
+
+//##############################################################################
+//# HELPER ROUTINES
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateMatDescr(cusparseMatDescr_t* descrA);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyMatDescr(cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCopyMatDescr(cusparseMatDescr_t       dest,
+                     const cusparseMatDescr_t src);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetMatType(cusparseMatDescr_t   descrA,
+                   cusparseMatrixType_t type);
+
+cusparseMatrixType_t CUSPARSEAPI
+cusparseGetMatType(const cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetMatFillMode(cusparseMatDescr_t descrA,
+                       cusparseFillMode_t fillMode);
+
+cusparseFillMode_t CUSPARSEAPI
+cusparseGetMatFillMode(const cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetMatDiagType(cusparseMatDescr_t descrA,
+                       cusparseDiagType_t diagType);
+
+cusparseDiagType_t CUSPARSEAPI
+cusparseGetMatDiagType(const cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetMatIndexBase(cusparseMatDescr_t  descrA,
+                        cusparseIndexBase_t base);
+
+cusparseIndexBase_t CUSPARSEAPI
+cusparseGetMatIndexBase(const cusparseMatDescr_t descrA);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsrsv2Info(csrsv2Info_t* info);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyCsrsv2Info(csrsv2Info_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsric02Info(csric02Info_t* info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyCsric02Info(csric02Info_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateBsric02Info(bsric02Info_t* info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyBsric02Info(bsric02Info_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsrilu02Info(csrilu02Info_t* info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyCsrilu02Info(csrilu02Info_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateBsrilu02Info(bsrilu02Info_t* info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyBsrilu02Info(bsrilu02Info_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateBsrsv2Info(bsrsv2Info_t* info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyBsrsv2Info(bsrsv2Info_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateBsrsm2Info(bsrsm2Info_t* info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyBsrsm2Info(bsrsm2Info_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsru2csrInfo(csru2csrInfo_t* info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyCsru2csrInfo(csru2csrInfo_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateColorInfo(cusparseColorInfo_t* info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyColorInfo(cusparseColorInfo_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetColorAlgs(cusparseColorInfo_t info,
+                     cusparseColorAlg_t  alg);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGetColorAlgs(cusparseColorInfo_t info,
+                     cusparseColorAlg_t* alg);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreatePruneInfo(pruneInfo_t* info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyPruneInfo(pruneInfo_t info);
+
+//##############################################################################
+//# SPARSE LEVEL 1 ROUTINES
+//##############################################################################
+
+CUSPARSE_DEPRECATED(cusparseAxpby)
+cusparseStatus_t CUSPARSEAPI
+cusparseSaxpyi(cusparseHandle_t    handle,
+               int                 nnz,
+               const float*        alpha,
+               const float*        xVal,
+               const int*          xInd,
+               float*              y,
+               cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseAxpby)
+cusparseStatus_t CUSPARSEAPI
+cusparseDaxpyi(cusparseHandle_t    handle,
+               int                 nnz,
+               const double*       alpha,
+               const double*       xVal,
+               const int*          xInd,
+               double*             y,
+               cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseAxpby)
+cusparseStatus_t CUSPARSEAPI
+cusparseCaxpyi(cusparseHandle_t    handle,
+               int                 nnz,
+               const cuComplex*    alpha,
+               const cuComplex*    xVal,
+               const int*          xInd,
+               cuComplex*          y,
+               cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseAxpby)
+cusparseStatus_t CUSPARSEAPI
+cusparseZaxpyi(cusparseHandle_t       handle,
+               int                    nnz,
+               const cuDoubleComplex* alpha,
+               const cuDoubleComplex* xVal,
+               const int*             xInd,
+               cuDoubleComplex*       y,
+               cusparseIndexBase_t    idxBase);
+
+CUSPARSE_DEPRECATED(cusparseGather)
+cusparseStatus_t CUSPARSEAPI
+cusparseSgthr(cusparseHandle_t    handle,
+              int                 nnz,
+              const float*        y,
+              float*              xVal,
+              const int*          xInd,
+              cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseGather)
+cusparseStatus_t CUSPARSEAPI
+cusparseDgthr(cusparseHandle_t    handle,
+              int                 nnz,
+              const double*       y,
+              double*             xVal,
+              const int*          xInd,
+              cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseGather)
+cusparseStatus_t CUSPARSEAPI
+cusparseCgthr(cusparseHandle_t    handle,
+              int                 nnz,
+              const cuComplex*    y,
+              cuComplex*          xVal,
+              const int*          xInd,
+              cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseGather)
+cusparseStatus_t CUSPARSEAPI
+cusparseZgthr(cusparseHandle_t       handle,
+              int                    nnz,
+              const cuDoubleComplex* y,
+              cuDoubleComplex*       xVal,
+              const int*             xInd,
+              cusparseIndexBase_t    idxBase);
+
+CUSPARSE_DEPRECATED(cusparseGather)
+cusparseStatus_t CUSPARSEAPI
+cusparseSgthrz(cusparseHandle_t    handle,
+               int                 nnz,
+               float*              y,
+               float*              xVal,
+               const int*          xInd,
+               cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseGather)
+cusparseStatus_t CUSPARSEAPI
+cusparseDgthrz(cusparseHandle_t    handle,
+               int                 nnz,
+               double*             y,
+               double*             xVal,
+               const int*          xInd,
+               cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseGather)
+cusparseStatus_t CUSPARSEAPI
+cusparseCgthrz(cusparseHandle_t    handle,
+               int                 nnz,
+               cuComplex*          y,
+               cuComplex*          xVal,
+               const int*          xInd,
+               cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseGather)
+cusparseStatus_t CUSPARSEAPI
+cusparseZgthrz(cusparseHandle_t    handle,
+               int                 nnz,
+               cuDoubleComplex*    y,
+               cuDoubleComplex*    xVal,
+               const int*          xInd,
+               cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseScatter)
+cusparseStatus_t CUSPARSEAPI
+cusparseSsctr(cusparseHandle_t    handle,
+              int                 nnz,
+              const float*        xVal,
+              const int*          xInd,
+              float*              y,
+              cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseScatter)
+cusparseStatus_t CUSPARSEAPI
+cusparseDsctr(cusparseHandle_t    handle,
+              int                 nnz,
+              const double*       xVal,
+              const int*          xInd,
+              double*             y,
+              cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseScatter)
+cusparseStatus_t CUSPARSEAPI
+cusparseCsctr(cusparseHandle_t    handle,
+              int                 nnz,
+              const cuComplex*    xVal,
+              const int*          xInd,
+              cuComplex*          y,
+              cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseScatter)
+cusparseStatus_t CUSPARSEAPI
+cusparseZsctr(cusparseHandle_t       handle,
+              int                    nnz,
+              const cuDoubleComplex* xVal,
+              const int*             xInd,
+              cuDoubleComplex*       y,
+              cusparseIndexBase_t    idxBase);
+
+CUSPARSE_DEPRECATED(cusparseRot)
+cusparseStatus_t CUSPARSEAPI
+cusparseSroti(cusparseHandle_t    handle,
+              int                 nnz,
+              float*              xVal,
+              const int*          xInd,
+              float*              y,
+              const float*        c,
+              const float*        s,
+              cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED(cusparseRot)
+cusparseStatus_t CUSPARSEAPI
+cusparseDroti(cusparseHandle_t    handle,
+              int                 nnz,
+              double*             xVal,
+              const int*          xInd,
+              double*             y,
+              const double*       c,
+              const double*       s,
+              cusparseIndexBase_t idxBase);
+
+//##############################################################################
+//# SPARSE LEVEL 2 ROUTINES
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgemvi(cusparseHandle_t    handle,
+               cusparseOperation_t transA,
+               int                 m,
+               int                 n,
+               const float*        alpha,
+               const float*        A,
+               int                 lda,
+               int                 nnz,
+               const float*        xVal,
+               const int*          xInd,
+               const float*        beta,
+               float*              y,
+               cusparseIndexBase_t idxBase,
+               void*               pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgemvi_bufferSize(cusparseHandle_t    handle,
+                          cusparseOperation_t transA,
+                          int                 m,
+                          int                 n,
+                          int                 nnz,
+                          int*                pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgemvi(cusparseHandle_t    handle,
+               cusparseOperation_t transA,
+               int                 m,
+               int                 n,
+               const double*       alpha,
+               const double*       A,
+               int                 lda,
+               int                 nnz,
+               const double*       xVal,
+               const int*          xInd,
+               const double*       beta,
+               double*             y,
+               cusparseIndexBase_t idxBase,
+               void*               pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgemvi_bufferSize(cusparseHandle_t    handle,
+                          cusparseOperation_t transA,
+                          int                 m,
+                          int                 n,
+                          int                 nnz,
+                          int*                pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgemvi(cusparseHandle_t    handle,
+               cusparseOperation_t transA,
+               int                 m,
+               int                 n,
+               const cuComplex*    alpha,
+               const cuComplex*    A,
+               int                 lda,
+               int                 nnz,
+               const cuComplex*    xVal,
+               const int*          xInd,
+               const cuComplex*    beta,
+               cuComplex*          y,
+               cusparseIndexBase_t idxBase,
+               void*               pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgemvi_bufferSize(cusparseHandle_t    handle,
+                          cusparseOperation_t transA,
+                          int                 m,
+                          int                 n,
+                          int                 nnz,
+                          int*                pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgemvi(cusparseHandle_t       handle,
+               cusparseOperation_t    transA,
+               int                    m,
+               int                    n,
+               const cuDoubleComplex* alpha,
+               const cuDoubleComplex* A,
+               int                    lda,
+               int                    nnz,
+               const cuDoubleComplex* xVal,
+               const int*             xInd,
+               const cuDoubleComplex* beta,
+               cuDoubleComplex*       y,
+               cusparseIndexBase_t    idxBase,
+               void*                  pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgemvi_bufferSize(cusparseHandle_t    handle,
+                          cusparseOperation_t transA,
+                          int                 m,
+                          int                 n,
+                          int                 nnz,
+                          int*                pBufferSize);
+
+CUSPARSE_DEPRECATED(cusparseSpMV)
+cusparseStatus_t CUSPARSEAPI
+cusparseCsrmvEx_bufferSize(cusparseHandle_t         handle,
+                           cusparseAlgMode_t        alg,
+                           cusparseOperation_t      transA,
+                           int                      m,
+                           int                      n,
+                           int                      nnz,
+                           const void*              alpha,
+                           cudaDataType             alphatype,
+                           const cusparseMatDescr_t descrA,
+                           const void*              csrValA,
+                           cudaDataType             csrValAtype,
+                           const int*               csrRowPtrA,
+                           const int*               csrColIndA,
+                           const void*              x,
+                           cudaDataType             xtype,
+                           const void*              beta,
+                           cudaDataType             betatype,
+                           void*                    y,
+                           cudaDataType             ytype,
+                           cudaDataType             executiontype,
+                           size_t*                  bufferSizeInBytes);
+
+CUSPARSE_DEPRECATED(cusparseSpMV)
+cusparseStatus_t CUSPARSEAPI
+cusparseCsrmvEx(cusparseHandle_t         handle,
+                cusparseAlgMode_t        alg,
+                cusparseOperation_t      transA,
+                int                      m,
+                int                      n,
+                int                      nnz,
+                const void*              alpha,
+                cudaDataType             alphatype,
+                const cusparseMatDescr_t descrA,
+                const void*              csrValA,
+                cudaDataType             csrValAtype,
+                const int*               csrRowPtrA,
+                const int*               csrColIndA,
+                const void*              x,
+                cudaDataType             xtype,
+                const void*              beta,
+                cudaDataType             betatype,
+                void*                    y,
+                cudaDataType             ytype,
+                cudaDataType             executiontype,
+                void*                    buffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrmv(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               int                      mb,
+               int                      nb,
+               int                      nnzb,
+               const float*             alpha,
+               const cusparseMatDescr_t descrA,
+               const float*             bsrSortedValA,
+               const int*               bsrSortedRowPtrA,
+               const int*               bsrSortedColIndA,
+               int                      blockDim,
+               const float*             x,
+               const float*             beta,
+               float*                   y);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrmv(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               int                      mb,
+               int                      nb,
+               int                      nnzb,
+               const double*            alpha,
+               const cusparseMatDescr_t descrA,
+               const double*            bsrSortedValA,
+               const int*               bsrSortedRowPtrA,
+               const int*               bsrSortedColIndA,
+               int                      blockDim,
+               const double*            x,
+               const double*            beta,
+               double*                  y);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrmv(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               int                      mb,
+               int                      nb,
+               int                      nnzb,
+               const cuComplex*         alpha,
+               const cusparseMatDescr_t descrA,
+               const cuComplex*         bsrSortedValA,
+               const int*               bsrSortedRowPtrA,
+               const int*               bsrSortedColIndA,
+               int                      blockDim,
+               const cuComplex*         x,
+               const cuComplex*         beta,
+               cuComplex*               y);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrmv(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               int                      mb,
+               int                      nb,
+               int                      nnzb,
+               const cuDoubleComplex*   alpha,
+               const cusparseMatDescr_t descrA,
+               const cuDoubleComplex*   bsrSortedValA,
+               const int*               bsrSortedRowPtrA,
+               const int*               bsrSortedColIndA,
+               int                      blockDim,
+               const cuDoubleComplex*   x,
+               const cuDoubleComplex*   beta,
+               cuDoubleComplex*         y);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrxmv(cusparseHandle_t         handle,
+                cusparseDirection_t      dirA,
+                cusparseOperation_t      transA,
+                int                      sizeOfMask,
+                int                      mb,
+                int                      nb,
+                int                      nnzb,
+                const float*             alpha,
+                const cusparseMatDescr_t descrA,
+                const float*             bsrSortedValA,
+                const int*               bsrSortedMaskPtrA,
+                const int*               bsrSortedRowPtrA,
+                const int*               bsrSortedEndPtrA,
+                const int*               bsrSortedColIndA,
+                int                      blockDim,
+                const float*             x,
+                const float*             beta,
+                float*                   y);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrxmv(cusparseHandle_t         handle,
+                cusparseDirection_t      dirA,
+                cusparseOperation_t      transA,
+                int                      sizeOfMask,
+                int                      mb,
+                int                      nb,
+                int                      nnzb,
+                const double*            alpha,
+                const cusparseMatDescr_t descrA,
+                const double*            bsrSortedValA,
+                const int*               bsrSortedMaskPtrA,
+                const int*               bsrSortedRowPtrA,
+                const int*               bsrSortedEndPtrA,
+                const int*               bsrSortedColIndA,
+                int                      blockDim,
+                const double*            x,
+                const double*            beta,
+                double*                  y);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrxmv(cusparseHandle_t         handle,
+                cusparseDirection_t      dirA,
+                cusparseOperation_t      transA,
+                int                      sizeOfMask,
+                int                      mb,
+                int                      nb,
+                int                      nnzb,
+                const cuComplex*         alpha,
+                const cusparseMatDescr_t descrA,
+                const cuComplex*         bsrSortedValA,
+                const int*               bsrSortedMaskPtrA,
+                const int*               bsrSortedRowPtrA,
+                const int*               bsrSortedEndPtrA,
+                const int*               bsrSortedColIndA,
+                int                      blockDim,
+                const cuComplex*         x,
+                const cuComplex*         beta,
+                cuComplex*               y);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrxmv(cusparseHandle_t         handle,
+                cusparseDirection_t      dirA,
+                cusparseOperation_t      transA,
+                int                      sizeOfMask,
+                int                      mb,
+                int                      nb,
+                int                      nnzb,
+                const cuDoubleComplex*   alpha,
+                const cusparseMatDescr_t descrA,
+                const cuDoubleComplex*   bsrSortedValA,
+                const int*               bsrSortedMaskPtrA,
+                const int*               bsrSortedRowPtrA,
+                const int*               bsrSortedEndPtrA,
+                const int*               bsrSortedColIndA,
+                int                      blockDim,
+                const cuDoubleComplex*   x,
+                const cuDoubleComplex*   beta,
+                cuDoubleComplex*         y);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsrsv2_zeroPivot(cusparseHandle_t handle,
+                          csrsv2Info_t     info,
+                          int*             position);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseOperation_t      transA,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           float*                   csrSortedValA,
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseOperation_t      transA,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           double*                  csrSortedValA,
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseOperation_t      transA,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           cuComplex*               csrSortedValA,
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseOperation_t      transA,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           cuDoubleComplex*         csrSortedValA,
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseOperation_t      transA,
+                              int                      m,
+                              int                      nnz,
+                              const cusparseMatDescr_t descrA,
+                              float*                   csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              csrsv2Info_t             info,
+                              size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseOperation_t      transA,
+                              int                      m,
+                              int                      nnz,
+                              const cusparseMatDescr_t descrA,
+                              double*                  csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              csrsv2Info_t             info,
+                              size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseOperation_t      transA,
+                              int                      m,
+                              int                      nnz,
+                              const cusparseMatDescr_t descrA,
+                              cuComplex*               csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              csrsv2Info_t             info,
+                              size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseOperation_t      transA,
+                              int                      m,
+                              int                      nnz,
+                              const cusparseMatDescr_t descrA,
+                              cuDoubleComplex*         csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              csrsv2Info_t             info,
+                              size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseOperation_t      transA,
+                         int                      m,
+                         int                      nnz,
+                         const cusparseMatDescr_t descrA,
+                         const float*             csrSortedValA,
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         csrsv2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseOperation_t      transA,
+                         int                      m,
+                         int                      nnz,
+                         const cusparseMatDescr_t descrA,
+                         const double*            csrSortedValA,
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         csrsv2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseOperation_t      transA,
+                         int                      m,
+                         int                      nnz,
+                         const cusparseMatDescr_t descrA,
+                         const cuComplex*         csrSortedValA,
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         csrsv2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseOperation_t      transA,
+                         int                      m,
+                         int                      nnz,
+                         const cusparseMatDescr_t descrA,
+                         const cuDoubleComplex*   csrSortedValA,
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         csrsv2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseOperation_t      transA,
+                      int                      m,
+                      int                      nnz,
+                      const float*             alpha,
+                      const cusparseMatDescr_t descrA,
+                      const float*             csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      csrsv2Info_t             info,
+                      const float*             f,
+                      float*                   x,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseOperation_t      transA,
+                      int                      m,
+                      int                      nnz,
+                      const double*            alpha,
+                      const cusparseMatDescr_t descrA,
+                      const double*            csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      csrsv2Info_t             info,
+                      const double*            f,
+                      double*                  x,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseOperation_t      transA,
+                      int                      m,
+                      int                      nnz,
+                      const cuComplex*         alpha,
+                      const cusparseMatDescr_t descrA,
+                      const cuComplex*         csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      csrsv2Info_t             info,
+                      const cuComplex*         f,
+                      cuComplex*               x,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSV)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseOperation_t      transA,
+                      int                      m,
+                      int                      nnz,
+                      const cuDoubleComplex*   alpha,
+                      const cusparseMatDescr_t descrA,
+                      const cuDoubleComplex*   csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      csrsv2Info_t             info,
+                      const cuDoubleComplex*   f,
+                      cuDoubleComplex*         x,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle,
+                          bsrsv2Info_t     info,
+                          int*             position);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           float*                   bsrSortedValA,
+                           const int*               bsrSortedRowPtrA,
+                           const int*               bsrSortedColIndA,
+                           int                      blockDim,
+                           bsrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           double*                  bsrSortedValA,
+                           const int*               bsrSortedRowPtrA,
+                           const int*               bsrSortedColIndA,
+                           int                      blockDim,
+                           bsrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuComplex*               bsrSortedValA,
+                           const int*               bsrSortedRowPtrA,
+                           const int*               bsrSortedColIndA,
+                           int                      blockDim,
+                           bsrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuDoubleComplex*         bsrSortedValA,
+                           const int*               bsrSortedRowPtrA,
+                           const int*               bsrSortedColIndA,
+                           int                      blockDim,
+                           bsrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              int                      mb,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              float*                   bsrSortedValA,
+                              const int*               bsrSortedRowPtrA,
+                              const int*               bsrSortedColIndA,
+                              int                      blockSize,
+                              bsrsv2Info_t             info,
+                              size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              int                      mb,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              double*                  bsrSortedValA,
+                              const int*               bsrSortedRowPtrA,
+                              const int*               bsrSortedColIndA,
+                              int                      blockSize,
+                              bsrsv2Info_t             info,
+                              size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              int                      mb,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              cuComplex*               bsrSortedValA,
+                              const int*               bsrSortedRowPtrA,
+                              const int*               bsrSortedColIndA,
+                              int                      blockSize,
+                              bsrsv2Info_t             info,
+                              size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              int                      mb,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              cuDoubleComplex*         bsrSortedValA,
+                              const int*               bsrSortedRowPtrA,
+                              const int*               bsrSortedColIndA,
+                              int                      blockSize,
+                              bsrsv2Info_t             info,
+                              size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         int                      mb,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const float*             bsrSortedValA,
+                         const int*               bsrSortedRowPtrA,
+                         const int*               bsrSortedColIndA,
+                         int                      blockDim,
+                         bsrsv2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         int                      mb,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const double*            bsrSortedValA,
+                         const int*               bsrSortedRowPtrA,
+                         const int*               bsrSortedColIndA,
+                         int                      blockDim,
+                         bsrsv2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         int                      mb,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const cuComplex*         bsrSortedValA,
+                         const int*               bsrSortedRowPtrA,
+                         const int*               bsrSortedColIndA,
+                         int                      blockDim,
+                         bsrsv2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         int                      mb,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const cuDoubleComplex*   bsrSortedValA,
+                         const int*               bsrSortedRowPtrA,
+                         const int*               bsrSortedColIndA,
+                         int                      blockDim,
+                         bsrsv2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      int                      mb,
+                      int                      nnzb,
+                      const float*             alpha,
+                      const cusparseMatDescr_t descrA,
+                      const float*             bsrSortedValA,
+                      const int*               bsrSortedRowPtrA,
+                      const int*               bsrSortedColIndA,
+                      int                      blockDim,
+                      bsrsv2Info_t             info,
+                      const float*             f,
+                      float*                   x,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      int                      mb,
+                      int                      nnzb,
+                      const double*            alpha,
+                      const cusparseMatDescr_t descrA,
+                      const double*            bsrSortedValA,
+                      const int*               bsrSortedRowPtrA,
+                      const int*               bsrSortedColIndA,
+                      int                      blockDim,
+                      bsrsv2Info_t             info,
+                      const double*            f,
+                      double*                  x,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      int                      mb,
+                      int                      nnzb,
+                      const cuComplex*         alpha,
+                      const cusparseMatDescr_t descrA,
+                      const cuComplex*         bsrSortedValA,
+                      const int*               bsrSortedRowPtrA,
+                      const int*               bsrSortedColIndA,
+                      int                      blockDim,
+                      bsrsv2Info_t             info,
+                      const cuComplex*         f,
+                      cuComplex*               x,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      int                      mb,
+                      int                      nnzb,
+                      const cuDoubleComplex*   alpha,
+                      const cusparseMatDescr_t descrA,
+                      const cuDoubleComplex*   bsrSortedValA,
+                      const int*               bsrSortedRowPtrA,
+                      const int*               bsrSortedColIndA,
+                      int                      blockDim,
+                      bsrsv2Info_t             info,
+                      const cuDoubleComplex*   f,
+                      cuDoubleComplex*         x,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+//##############################################################################
+//# SPARSE LEVEL 3 ROUTINES
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrmm(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               cusparseOperation_t      transB,
+               int                      mb,
+               int                      n,
+               int                      kb,
+               int                      nnzb,
+               const float*             alpha,
+               const cusparseMatDescr_t descrA,
+               const float* bsrSortedValA,
+               const int*   bsrSortedRowPtrA,
+               const int*   bsrSortedColIndA,
+               const int    blockSize,
+               const float* B,
+               const int    ldb,
+               const float* beta,
+               float*       C,
+               int          ldc);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrmm(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               cusparseOperation_t      transB,
+               int                      mb,
+               int                      n,
+               int                      kb,
+               int                      nnzb,
+               const double*            alpha,
+               const cusparseMatDescr_t descrA,
+               const double* bsrSortedValA,
+               const int*    bsrSortedRowPtrA,
+               const int*    bsrSortedColIndA,
+               const int     blockSize,
+               const double* B,
+               const int     ldb,
+               const double* beta,
+               double*       C,
+               int           ldc);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrmm(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               cusparseOperation_t      transB,
+               int                      mb,
+               int                      n,
+               int                      kb,
+               int                      nnzb,
+               const cuComplex*         alpha,
+               const cusparseMatDescr_t descrA,
+               const cuComplex* bsrSortedValA,
+               const int*       bsrSortedRowPtrA,
+               const int*       bsrSortedColIndA,
+               const int        blockSize,
+               const cuComplex* B,
+               const int        ldb,
+               const cuComplex* beta,
+               cuComplex*       C,
+               int              ldc);
+
+cusparseStatus_t CUSPARSEAPI
+ cusparseZbsrmm(cusparseHandle_t         handle,
+                cusparseDirection_t      dirA,
+                cusparseOperation_t      transA,
+                cusparseOperation_t      transB,
+                int                      mb,
+                int                      n,
+                int                      kb,
+                int                      nnzb,
+                const cuDoubleComplex*   alpha,
+                const cusparseMatDescr_t descrA,
+                const cuDoubleComplex*   bsrSortedValA,
+                const int*               bsrSortedRowPtrA,
+                const int*               bsrSortedColIndA,
+                const int                blockSize,
+                const cuDoubleComplex*   B,
+                const int                ldb,
+                const cuDoubleComplex*   beta,
+                cuDoubleComplex*         C,
+                int                      ldc);
+
+CUSPARSE_DEPRECATED(cusparseSpMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseSgemmi(cusparseHandle_t handle,
+               int              m,
+               int              n,
+               int              k,
+               int              nnz,
+               const float*     alpha,
+               const float*     A,
+               int              lda,
+               const float*     cscValB,
+               const int*       cscColPtrB,
+               const int*       cscRowIndB,
+               const float*     beta,
+               float*           C,
+               int              ldc);
+
+CUSPARSE_DEPRECATED(cusparseSpMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseDgemmi(cusparseHandle_t handle,
+               int              m,
+               int              n,
+               int              k,
+               int              nnz,
+               const double*    alpha,
+               const double*    A,
+               int              lda,
+               const double*    cscValB,
+               const int*       cscColPtrB,
+               const int*       cscRowIndB,
+               const double*    beta,
+               double*          C,
+               int              ldc);
+
+CUSPARSE_DEPRECATED(cusparseSpMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseCgemmi(cusparseHandle_t handle,
+               int              m,
+               int              n,
+               int              k,
+               int              nnz,
+               const cuComplex* alpha,
+               const cuComplex* A,
+               int              lda,
+               const cuComplex* cscValB,
+               const int*       cscColPtrB,
+               const int*       cscRowIndB,
+               const cuComplex* beta,
+               cuComplex*       C,
+               int              ldc);
+
+CUSPARSE_DEPRECATED(cusparseSpMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseZgemmi(cusparseHandle_t       handle,
+               int                    m,
+               int                    n,
+               int                    k,
+               int                    nnz,
+               const cuDoubleComplex* alpha,
+               const cuDoubleComplex* A,
+               int                    lda,
+               const cuDoubleComplex* cscValB,
+               const int*             cscColPtrB,
+               const int*             cscRowIndB,
+               const cuDoubleComplex* beta,
+               cuDoubleComplex*       C,
+               int                    ldc);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsrsm2Info(csrsm2Info_t* info);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyCsrsm2Info(csrsm2Info_t info);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsrsm2_zeroPivot(cusparseHandle_t handle,
+                          csrsm2Info_t     info,
+                          int* position);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              int                      algo,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,
+                              int                      m,
+                              int                      nrhs,
+                              int                      nnz,
+                              const float*             alpha,
+                              const cusparseMatDescr_t descrA,
+                              const float*             csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              const float*             B,
+                              int                      ldb,
+                              csrsm2Info_t             info,
+                              cusparseSolvePolicy_t    policy,
+                              size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              int                      algo,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,
+                              int                      m,
+                              int                      nrhs,
+                              int                      nnz,
+                              const double*            alpha,
+                              const cusparseMatDescr_t descrA,
+                              const double*            csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              const double*            B,
+                              int                      ldb,
+                              csrsm2Info_t             info,
+                              cusparseSolvePolicy_t    policy,
+                              size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              int                      algo,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,
+                              int                      m,
+                              int                      nrhs,
+                              int                      nnz,
+                              const cuComplex*         alpha,
+                              const cusparseMatDescr_t descrA,
+                              const cuComplex*         csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              const cuComplex*         B,
+                              int                      ldb,
+                              csrsm2Info_t             info,
+                              cusparseSolvePolicy_t    policy,
+                              size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              int                      algo,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,
+                              int                      m,
+                              int                      nrhs,
+                              int                      nnz,
+                              const cuDoubleComplex*   alpha,
+                              const cusparseMatDescr_t descrA,
+                              const cuDoubleComplex*   csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              const cuDoubleComplex*   B,
+                              int                      ldb,
+                              csrsm2Info_t             info,
+                              cusparseSolvePolicy_t    policy,
+                              size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrsm2_analysis(cusparseHandle_t         handle,
+                         int                      algo,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transB,
+                         int                      m,
+                         int                      nrhs,
+                         int                      nnz,
+                         const float*             alpha,
+                         const cusparseMatDescr_t descrA,
+                         const float*             csrSortedValA,
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         const float*             B,
+                         int                      ldb,
+                         csrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrsm2_analysis(cusparseHandle_t         handle,
+                         int                      algo,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transB,
+                         int                      m,
+                         int                      nrhs,
+                         int                      nnz,
+                         const double*            alpha,
+                         const cusparseMatDescr_t descrA,
+                         const double*            csrSortedValA,
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         const double*            B,
+                         int                      ldb,
+                         csrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrsm2_analysis(cusparseHandle_t         handle,
+                         int                      algo,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transB,
+                         int                      m,
+                         int                      nrhs,
+                         int                      nnz,
+                         const cuComplex*         alpha,
+                         const cusparseMatDescr_t descrA,
+                         const cuComplex*         csrSortedValA,
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         const cuComplex*         B,
+                         int                      ldb,
+                         csrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrsm2_analysis(cusparseHandle_t         handle,
+                         int                      algo,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transB,
+                         int                      m,
+                         int                      nrhs,
+                         int                      nnz,
+                         const cuDoubleComplex*   alpha,
+                         const cusparseMatDescr_t descrA,
+                         const cuDoubleComplex*   csrSortedValA,
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         const cuDoubleComplex*   B,
+                         int                      ldb,
+                         csrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrsm2_solve(cusparseHandle_t         handle,
+                      int                      algo,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transB,
+                      int                      m,
+                      int                      nrhs,
+                      int                      nnz,
+                      const float*             alpha,
+                      const cusparseMatDescr_t descrA,
+                      const float*             csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      float*                   B,
+                      int                      ldb,
+                      csrsm2Info_t             info,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrsm2_solve(cusparseHandle_t         handle,
+                      int                      algo,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transB,
+                      int                      m,
+                      int                      nrhs,
+                      int                      nnz,
+                      const double*            alpha,
+                      const cusparseMatDescr_t descrA,
+                      const double*            csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      double*                  B,
+                      int                      ldb,
+                      csrsm2Info_t             info,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrsm2_solve(cusparseHandle_t         handle,
+                      int                      algo,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transB,
+                      int                      m,
+                      int                      nrhs,
+                      int                      nnz,
+                      const cuComplex*         alpha,
+                      const cusparseMatDescr_t descrA,
+                      const cuComplex*         csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      cuComplex*               B,
+                      int                      ldb,
+                      csrsm2Info_t             info,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpSM)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrsm2_solve(cusparseHandle_t         handle,
+                      int                      algo,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transB,
+                      int                      m,
+                      int                      nrhs,
+                      int                      nnz,
+                      const cuDoubleComplex*   alpha,
+                      const cusparseMatDescr_t descrA,
+                      const cuDoubleComplex*   csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      cuDoubleComplex*         B,
+                      int                      ldb,
+                      csrsm2Info_t             info,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle,
+                          bsrsm2Info_t     info,
+                          int*             position);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsm2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           cusparseOperation_t      transXY,
+                           int                      mb,
+                           int                      n,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           float*                   bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockSize,
+                           bsrsm2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsm2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           cusparseOperation_t      transXY,
+                           int                      mb,
+                           int                      n,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           double*                  bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockSize,
+                           bsrsm2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsm2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           cusparseOperation_t      transXY,
+                           int                      mb,
+                           int                      n,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuComplex*               bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockSize,
+                           bsrsm2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsm2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           cusparseOperation_t      transXY,
+                           int                      mb,
+                           int                      n,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuDoubleComplex*         bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockSize,
+                           bsrsm2Info_t             info,
+                           int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,
+                              int                      mb,
+                              int                      n,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              float*                   bsrSortedVal,
+                              const int*               bsrSortedRowPtr,
+                              const int*               bsrSortedColInd,
+                              int                      blockSize,
+                              bsrsm2Info_t             info,
+                              size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,
+                              int                      mb,
+                              int                      n,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              double*                  bsrSortedVal,
+                              const int*               bsrSortedRowPtr,
+                              const int*               bsrSortedColInd,
+                              int                      blockSize,
+                              bsrsm2Info_t             info,
+                              size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,
+                              int                      mb,
+                              int                      n,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              cuComplex*               bsrSortedVal,
+                              const int*               bsrSortedRowPtr,
+                              const int*               bsrSortedColInd,
+                              int                      blockSize,
+                              bsrsm2Info_t             info,
+                              size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,
+                              int                      mb,
+                              int                      n,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              cuDoubleComplex*         bsrSortedVal,
+                              const int*               bsrSortedRowPtr,
+                              const int*               bsrSortedColInd,
+                              int                      blockSize,
+                              bsrsm2Info_t             info,
+                              size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsm2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transXY,
+                         int                      mb,
+                         int                      n,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const float*             bsrSortedVal,
+                         const int*               bsrSortedRowPtr,
+                         const int*               bsrSortedColInd,
+                         int                      blockSize,
+                         bsrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsm2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transXY,
+                         int                      mb,
+                         int                      n,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const double*            bsrSortedVal,
+                         const int*               bsrSortedRowPtr,
+                         const int*               bsrSortedColInd,
+                         int                      blockSize,
+                         bsrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsm2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transXY,
+                         int                      mb,
+                         int                      n,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const cuComplex*         bsrSortedVal,
+                         const int*               bsrSortedRowPtr,
+                         const int*               bsrSortedColInd,
+                         int                      blockSize,
+                         bsrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsm2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transXY,
+                         int                      mb,
+                         int                      n,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const cuDoubleComplex*   bsrSortedVal,
+                         const int*               bsrSortedRowPtr,
+                         const int*               bsrSortedColInd,
+                         int                      blockSize,
+                         bsrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsm2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transXY,
+                      int                      mb,
+                      int                      n,
+                      int                      nnzb,
+                      const float*             alpha,
+                      const cusparseMatDescr_t descrA,
+                      const float*             bsrSortedVal,
+                      const int*               bsrSortedRowPtr,
+                      const int*               bsrSortedColInd,
+                      int                      blockSize,
+                      bsrsm2Info_t             info,
+                      const float*             B,
+                      int                      ldb,
+                      float*                   X,
+                      int                      ldx,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsm2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transXY,
+                      int                      mb,
+                      int                      n,
+                      int                      nnzb,
+                      const double*            alpha,
+                      const cusparseMatDescr_t descrA,
+                      const double*            bsrSortedVal,
+                      const int*               bsrSortedRowPtr,
+                      const int*               bsrSortedColInd,
+                      int                      blockSize,
+                      bsrsm2Info_t             info,
+                      const double*            B,
+                      int                      ldb,
+                      double*                  X,
+                      int                      ldx,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsm2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transXY,
+                      int                      mb,
+                      int                      n,
+                      int                      nnzb,
+                      const cuComplex*         alpha,
+                      const cusparseMatDescr_t descrA,
+                      const cuComplex*         bsrSortedVal,
+                      const int*               bsrSortedRowPtr,
+                      const int*               bsrSortedColInd,
+                      int                      blockSize,
+                      bsrsm2Info_t             info,
+                      const cuComplex*         B,
+                      int                      ldb,
+                      cuComplex*               X,
+                      int                      ldx,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsm2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transXY,
+                      int                      mb,
+                      int                      n,
+                      int                      nnzb,
+                      const cuDoubleComplex*   alpha,
+                      const cusparseMatDescr_t descrA,
+                      const cuDoubleComplex*   bsrSortedVal,
+                      const int*               bsrSortedRowPtr,
+                      const int*               bsrSortedColInd,
+                      int                      blockSize,
+                      bsrsm2Info_t             info,
+                      const cuDoubleComplex*   B,
+                      int                      ldb,
+                      cuDoubleComplex*         X,
+                      int                      ldx,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);
+
+//##############################################################################
+//# PRECONDITIONERS
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrilu02_numericBoost(cusparseHandle_t handle,
+                               csrilu02Info_t   info,
+                               int              enable_boost,
+                               double*          tol,
+                               float*           boost_val);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrilu02_numericBoost(cusparseHandle_t handle,
+                               csrilu02Info_t   info,
+                               int              enable_boost,
+                               double*          tol,
+                               double*          boost_val);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrilu02_numericBoost(cusparseHandle_t handle,
+                               csrilu02Info_t   info,
+                               int              enable_boost,
+                               double*          tol,
+                               cuComplex*       boost_val);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrilu02_numericBoost(cusparseHandle_t handle,
+                               csrilu02Info_t   info,
+                               int              enable_boost,
+                               double*          tol,
+                               cuDoubleComplex* boost_val);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsrilu02_zeroPivot(cusparseHandle_t handle,
+                            csrilu02Info_t   info,
+                            int*             position);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrilu02_bufferSize(cusparseHandle_t         handle,
+                             int                      m,
+                             int                      nnz,
+                             const cusparseMatDescr_t descrA,
+                             float*                   csrSortedValA,
+                             const int*               csrSortedRowPtrA,
+                             const int*               csrSortedColIndA,
+                             csrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrilu02_bufferSize(cusparseHandle_t         handle,
+                             int                      m,
+                             int                      nnz,
+                             const cusparseMatDescr_t descrA,
+                             double*                  csrSortedValA,
+                             const int*               csrSortedRowPtrA,
+                             const int*               csrSortedColIndA,
+                             csrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrilu02_bufferSize(cusparseHandle_t         handle,
+                             int                      m,
+                             int                      nnz,
+                             const cusparseMatDescr_t descrA,
+                             cuComplex*               csrSortedValA,
+                             const int*               csrSortedRowPtrA,
+                             const int*               csrSortedColIndA,
+                             csrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrilu02_bufferSize(cusparseHandle_t         handle,
+                             int                      m,
+                             int                      nnz,
+                             const cusparseMatDescr_t descrA,
+                             cuDoubleComplex*         csrSortedValA,
+                             const int*               csrSortedRowPtrA,
+                             const int*               csrSortedColIndA,
+                             csrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      nnz,
+                                const cusparseMatDescr_t descrA,
+                                float*                   csrSortedVal,
+                                const int*               csrSortedRowPtr,
+                                const int*               csrSortedColInd,
+                                csrilu02Info_t           info,
+                                size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      nnz,
+                                const cusparseMatDescr_t descrA,
+                                double*                  csrSortedVal,
+                                const int*               csrSortedRowPtr,
+                                const int*               csrSortedColInd,
+                                csrilu02Info_t           info,
+                                size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      nnz,
+                                const cusparseMatDescr_t descrA,
+                                cuComplex*               csrSortedVal,
+                                const int*               csrSortedRowPtr,
+                                const int*               csrSortedColInd,
+                                csrilu02Info_t           info,
+                                size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      nnz,
+                                const cusparseMatDescr_t descrA,
+                                cuDoubleComplex*         csrSortedVal,
+                                const int*               csrSortedRowPtr,
+                                const int*               csrSortedColInd,
+                                csrilu02Info_t           info,
+                                size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrilu02_analysis(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           const float*             csrSortedValA,
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrilu02_analysis(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           const double*            csrSortedValA,
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrilu02_analysis(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           const cuComplex*         csrSortedValA,
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrilu02_analysis(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           const cuDoubleComplex*   csrSortedValA,
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrilu02(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  float*                   csrSortedValA_valM,
+                  const int*            csrSortedRowPtrA,
+                  const int*            csrSortedColIndA,
+                  csrilu02Info_t        info,
+                  cusparseSolvePolicy_t policy,
+                  void*                 pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrilu02(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  double*                  csrSortedValA_valM,
+                  const int*            csrSortedRowPtrA,
+                  const int*            csrSortedColIndA,
+                  csrilu02Info_t        info,
+                  cusparseSolvePolicy_t policy,
+                  void*                 pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrilu02(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  cuComplex*               csrSortedValA_valM,
+                  const int*            csrSortedRowPtrA,
+                  const int*            csrSortedColIndA,
+                  csrilu02Info_t        info,
+                  cusparseSolvePolicy_t policy,
+                  void*                 pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrilu02(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  cuDoubleComplex*         csrSortedValA_valM,
+                  const int*            csrSortedRowPtrA,
+                  const int*            csrSortedColIndA,
+                  csrilu02Info_t        info,
+                  cusparseSolvePolicy_t policy,
+                  void*                 pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrilu02_numericBoost(cusparseHandle_t handle,
+                               bsrilu02Info_t   info,
+                               int              enable_boost,
+                               double*          tol,
+                               float*           boost_val);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrilu02_numericBoost(cusparseHandle_t handle,
+                               bsrilu02Info_t   info,
+                               int              enable_boost,
+                               double*          tol,
+                               double*          boost_val);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrilu02_numericBoost(cusparseHandle_t handle,
+                               bsrilu02Info_t   info,
+                               int              enable_boost,
+                               double*          tol,
+                               cuComplex*       boost_val);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrilu02_numericBoost(cusparseHandle_t handle,
+                               bsrilu02Info_t   info,
+                               int              enable_boost,
+                               double*          tol,
+                               cuDoubleComplex* boost_val);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXbsrilu02_zeroPivot(cusparseHandle_t handle,
+                            bsrilu02Info_t   info,
+                            int*             position);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrilu02_bufferSize(cusparseHandle_t         handle,
+                             cusparseDirection_t      dirA,
+                             int                      mb,
+                             int                      nnzb,
+                             const cusparseMatDescr_t descrA,
+                             float*                   bsrSortedVal,
+                             const int*               bsrSortedRowPtr,
+                             const int*               bsrSortedColInd,
+                             int                      blockDim,
+                             bsrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrilu02_bufferSize(cusparseHandle_t         handle,
+                             cusparseDirection_t      dirA,
+                             int                      mb,
+                             int                      nnzb,
+                             const cusparseMatDescr_t descrA,
+                             double*                  bsrSortedVal,
+                             const int*               bsrSortedRowPtr,
+                             const int*               bsrSortedColInd,
+                             int                      blockDim,
+                             bsrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrilu02_bufferSize(cusparseHandle_t         handle,
+                             cusparseDirection_t      dirA,
+                             int                      mb,
+                             int                      nnzb,
+                             const cusparseMatDescr_t descrA,
+                             cuComplex*               bsrSortedVal,
+                             const int*               bsrSortedRowPtr,
+                             const int*               bsrSortedColInd,
+                             int                      blockDim,
+                             bsrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrilu02_bufferSize(cusparseHandle_t         handle,
+                             cusparseDirection_t      dirA,
+                             int                      mb,
+                             int                      nnzb,
+                             const cusparseMatDescr_t descrA,
+                             cuDoubleComplex*         bsrSortedVal,
+                             const int*               bsrSortedRowPtr,
+                             const int*               bsrSortedColInd,
+                             int                      blockDim,
+                             bsrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                float*                   bsrSortedVal,
+                                const int*               bsrSortedRowPtr,
+                                const int*               bsrSortedColInd,
+                                int                      blockSize,
+                                bsrilu02Info_t           info,
+                                size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                double*                  bsrSortedVal,
+                                const int*               bsrSortedRowPtr,
+                                const int*               bsrSortedColInd,
+                                int                      blockSize,
+                                bsrilu02Info_t           info,
+                                size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                cuComplex*               bsrSortedVal,
+                                const int*               bsrSortedRowPtr,
+                                const int*               bsrSortedColInd,
+                                int                      blockSize,
+                                bsrilu02Info_t           info,
+                                size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                               cusparseDirection_t      dirA,
+                               int                      mb,
+                               int                      nnzb,
+                               const cusparseMatDescr_t descrA,
+                               cuDoubleComplex*         bsrSortedVal,
+                               const int*               bsrSortedRowPtr,
+                               const int*               bsrSortedColInd,
+                               int                      blockSize,
+                               bsrilu02Info_t           info,
+                               size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrilu02_analysis(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           float*                   bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockDim,
+                           bsrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrilu02_analysis(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           double*                  bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockDim,
+                           bsrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrilu02_analysis(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuComplex*               bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockDim,
+                           bsrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrilu02_analysis(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuDoubleComplex*         bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockDim,
+                           bsrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrilu02(cusparseHandle_t         handle,
+                  cusparseDirection_t      dirA,
+                  int                      mb,
+                  int                      nnzb,
+                  const cusparseMatDescr_t descrA,
+                  float*                   bsrSortedVal,
+                  const int*               bsrSortedRowPtr,
+                  const int*               bsrSortedColInd,
+                  int                      blockDim,
+                  bsrilu02Info_t           info,
+                  cusparseSolvePolicy_t    policy,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrilu02(cusparseHandle_t         handle,
+                  cusparseDirection_t      dirA,
+                  int                      mb,
+                  int                      nnzb,
+                  const cusparseMatDescr_t descrA,
+                  double*                  bsrSortedVal,
+                  const int*               bsrSortedRowPtr,
+                  const int*               bsrSortedColInd,
+                  int                      blockDim,
+                  bsrilu02Info_t           info,
+                  cusparseSolvePolicy_t    policy,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrilu02(cusparseHandle_t         handle,
+                  cusparseDirection_t      dirA,
+                  int                      mb,
+                  int                      nnzb,
+                  const cusparseMatDescr_t descrA,
+                  cuComplex*               bsrSortedVal,
+                  const int*               bsrSortedRowPtr,
+                  const int*               bsrSortedColInd,
+                  int                      blockDim,
+                  bsrilu02Info_t           info,
+                  cusparseSolvePolicy_t    policy,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrilu02(cusparseHandle_t         handle,
+                  cusparseDirection_t      dirA,
+                  int                      mb,
+                  int                      nnzb,
+                  const cusparseMatDescr_t descrA,
+                  cuDoubleComplex*         bsrSortedVal,
+                  const int*               bsrSortedRowPtr,
+                  const int*               bsrSortedColInd,
+                  int                      blockDim,
+                  bsrilu02Info_t           info,
+                  cusparseSolvePolicy_t    policy,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsric02_zeroPivot(cusparseHandle_t handle,
+                           csric02Info_t    info,
+                           int*             position);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsric02_bufferSize(cusparseHandle_t         handle,
+                            int                      m,
+                            int                      nnz,
+                            const cusparseMatDescr_t descrA,
+                            float*                   csrSortedValA,
+                            const int*               csrSortedRowPtrA,
+                            const int*               csrSortedColIndA,
+                            csric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsric02_bufferSize(cusparseHandle_t         handle,
+                            int                      m,
+                            int                      nnz,
+                            const cusparseMatDescr_t descrA,
+                            double*                  csrSortedValA,
+                            const int*               csrSortedRowPtrA,
+                            const int*               csrSortedColIndA,
+                            csric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsric02_bufferSize(cusparseHandle_t         handle,
+                            int                      m,
+                            int                      nnz,
+                            const cusparseMatDescr_t descrA,
+                            cuComplex*               csrSortedValA,
+                            const int*               csrSortedRowPtrA,
+                            const int*               csrSortedColIndA,
+                            csric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsric02_bufferSize(cusparseHandle_t         handle,
+                            int                      m,
+                            int                      nnz,
+                            const cusparseMatDescr_t descrA,
+                            cuDoubleComplex*         csrSortedValA,
+                            const int*               csrSortedRowPtrA,
+                            const int*               csrSortedColIndA,
+                            csric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               int                      m,
+                               int                      nnz,
+                               const cusparseMatDescr_t descrA,
+                               float*                   csrSortedVal,
+                               const int*               csrSortedRowPtr,
+                               const int*               csrSortedColInd,
+                               csric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               int                      m,
+                               int                      nnz,
+                               const cusparseMatDescr_t descrA,
+                               double*                  csrSortedVal,
+                               const int*               csrSortedRowPtr,
+                               const int*               csrSortedColInd,
+                               csric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               int                      m,
+                               int                      nnz,
+                               const cusparseMatDescr_t descrA,
+                               cuComplex*               csrSortedVal,
+                               const int*               csrSortedRowPtr,
+                               const int*               csrSortedColInd,
+                               csric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               int                      m,
+                               int                      nnz,
+                               const cusparseMatDescr_t descrA,
+                               cuDoubleComplex*         csrSortedVal,
+                               const int*               csrSortedRowPtr,
+                               const int*               csrSortedColInd,
+                               csric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsric02_analysis(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      nnz,
+                          const cusparseMatDescr_t descrA,
+                          const float*             csrSortedValA,
+                          const int*               csrSortedRowPtrA,
+                          const int*               csrSortedColIndA,
+                          csric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsric02_analysis(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      nnz,
+                          const cusparseMatDescr_t descrA,
+                          const double*            csrSortedValA,
+                          const int*               csrSortedRowPtrA,
+                          const int*               csrSortedColIndA,
+                          csric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsric02_analysis(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      nnz,
+                          const cusparseMatDescr_t descrA,
+                          const cuComplex*         csrSortedValA,
+                          const int*               csrSortedRowPtrA,
+                          const int*               csrSortedColIndA,
+                          csric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsric02_analysis(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      nnz,
+                          const cusparseMatDescr_t descrA,
+                          const cuDoubleComplex*   csrSortedValA,
+                          const int*               csrSortedRowPtrA,
+                          const int*               csrSortedColIndA,
+                          csric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsric02(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 float*                   csrSortedValA_valM,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 csric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsric02(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 double*                  csrSortedValA_valM,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 csric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsric02(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 cuComplex*               csrSortedValA_valM,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 csric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsric02(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 cuDoubleComplex*         csrSortedValA_valM,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 csric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXbsric02_zeroPivot(cusparseHandle_t handle,
+                           bsric02Info_t    info,
+                           int*             position);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsric02_bufferSize(cusparseHandle_t         handle,
+                            cusparseDirection_t      dirA,
+                            int                      mb,
+                            int                      nnzb,
+                            const cusparseMatDescr_t descrA,
+                            float*                   bsrSortedVal,
+                            const int*               bsrSortedRowPtr,
+                            const int*               bsrSortedColInd,
+                            int                      blockDim,
+                            bsric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsric02_bufferSize(cusparseHandle_t         handle,
+                            cusparseDirection_t      dirA,
+                            int                      mb,
+                            int                      nnzb,
+                            const cusparseMatDescr_t descrA,
+                            double*                  bsrSortedVal,
+                            const int*               bsrSortedRowPtr,
+                            const int*               bsrSortedColInd,
+                            int                      blockDim,
+                            bsric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsric02_bufferSize(cusparseHandle_t         handle,
+                            cusparseDirection_t      dirA,
+                            int                      mb,
+                            int                      nnzb,
+                            const cusparseMatDescr_t descrA,
+                            cuComplex*               bsrSortedVal,
+                            const int*               bsrSortedRowPtr,
+                            const int*               bsrSortedColInd,
+                            int                      blockDim,
+                            bsric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsric02_bufferSize(cusparseHandle_t         handle,
+                            cusparseDirection_t      dirA,
+                            int                      mb,
+                            int                      nnzb,
+                            const cusparseMatDescr_t descrA,
+                            cuDoubleComplex*         bsrSortedVal,
+                            const int*               bsrSortedRowPtr,
+                            const int*               bsrSortedColInd,
+                            int                      blockDim,
+                            bsric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               cusparseDirection_t      dirA,
+                               int                      mb,
+                               int                      nnzb,
+                               const cusparseMatDescr_t descrA,
+                               float*                   bsrSortedVal,
+                               const int*               bsrSortedRowPtr,
+                               const int*               bsrSortedColInd,
+                               int                      blockSize,
+                               bsric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               cusparseDirection_t      dirA,
+                               int                      mb,
+                               int                      nnzb,
+                               const cusparseMatDescr_t descrA,
+                               double*                  bsrSortedVal,
+                               const int*               bsrSortedRowPtr,
+                               const int*               bsrSortedColInd,
+                               int                      blockSize,
+                               bsric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               cusparseDirection_t      dirA,
+                               int                      mb,
+                               int                      nnzb,
+                               const cusparseMatDescr_t descrA,
+                               cuComplex*               bsrSortedVal,
+                               const int*               bsrSortedRowPtr,
+                               const int*               bsrSortedColInd,
+                               int                      blockSize,
+                               bsric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               cusparseDirection_t      dirA,
+                               int                      mb,
+                               int                      nnzb,
+                               const cusparseMatDescr_t descrA,
+                               cuDoubleComplex*         bsrSortedVal,
+                               const int*               bsrSortedRowPtr,
+                               const int*               bsrSortedColInd,
+                               int                      blockSize,
+                               bsric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsric02_analysis(cusparseHandle_t         handle,
+                          cusparseDirection_t      dirA,
+                          int                      mb,
+                          int                      nnzb,
+                          const cusparseMatDescr_t descrA,
+                          const float*             bsrSortedVal,
+                          const int*               bsrSortedRowPtr,
+                          const int*               bsrSortedColInd,
+                          int                      blockDim,
+                          bsric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pInputBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsric02_analysis(cusparseHandle_t         handle,
+                          cusparseDirection_t      dirA,
+                          int                      mb,
+                          int                      nnzb,
+                          const cusparseMatDescr_t descrA,
+                          const double*            bsrSortedVal,
+                          const int*               bsrSortedRowPtr,
+                          const int*               bsrSortedColInd,
+                          int                      blockDim,
+                          bsric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pInputBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsric02_analysis(cusparseHandle_t         handle,
+                          cusparseDirection_t      dirA,
+                          int                      mb,
+                          int                      nnzb,
+                          const cusparseMatDescr_t descrA,
+                          const cuComplex*         bsrSortedVal,
+                          const int*               bsrSortedRowPtr,
+                          const int*               bsrSortedColInd,
+                          int                      blockDim,
+                          bsric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pInputBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsric02_analysis(cusparseHandle_t         handle,
+                          cusparseDirection_t      dirA,
+                          int                      mb,
+                          int                      nnzb,
+                          const cusparseMatDescr_t descrA,
+                          const cuDoubleComplex*   bsrSortedVal,
+                          const int*               bsrSortedRowPtr,
+                          const int*               bsrSortedColInd,
+                          int                      blockDim,
+                          bsric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pInputBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsric02(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nnzb,
+                 const cusparseMatDescr_t descrA,
+                 float*                   bsrSortedVal,
+                 const int*               bsrSortedRowPtr,
+                 const int*               bsrSortedColInd,
+                 int                      blockDim,
+                 bsric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsric02(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nnzb,
+                 const cusparseMatDescr_t descrA,
+                 double*                  bsrSortedVal,
+                 const int*               bsrSortedRowPtr,
+                 const int*               bsrSortedColInd,
+                 int                      blockDim,
+                 bsric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsric02(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nnzb,
+                 const cusparseMatDescr_t descrA,
+                 cuComplex*               bsrSortedVal,
+                 const int*               bsrSortedRowPtr,
+                 const int*
+                      bsrSortedColInd,
+                 int                      blockDim,
+                 bsric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsric02(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nnzb,
+                 const cusparseMatDescr_t descrA,
+                 cuDoubleComplex*         bsrSortedVal,
+                 const int*               bsrSortedRowPtr,
+                 const int*               bsrSortedColInd,
+                 int                      blockDim,
+                 bsric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgtsv2_bufferSizeExt(cusparseHandle_t handle,
+                             int              m,
+                             int              n,
+                             const float*     dl,
+                             const float*     d,
+                             const float*     du,
+                             const float*     B,
+                             int              ldb,
+                             size_t*          bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgtsv2_bufferSizeExt(cusparseHandle_t handle,
+                             int              m,
+                             int              n,
+                             const double*    dl,
+                             const double*    d,
+                             const double*    du,
+                             const double*    B,
+                             int              ldb,
+                             size_t*          bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgtsv2_bufferSizeExt(cusparseHandle_t handle,
+                             int              m,
+                             int              n,
+                             const cuComplex* dl,
+                             const cuComplex* d,
+                             const cuComplex* du,
+                             const cuComplex* B,
+                             int              ldb,
+                             size_t*          bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgtsv2_bufferSizeExt(cusparseHandle_t       handle,
+                             int                    m,
+                             int                    n,
+                             const cuDoubleComplex* dl,
+                             const cuDoubleComplex* d,
+                             const cuDoubleComplex* du,
+                             const cuDoubleComplex* B,
+                             int                    ldb,
+                             size_t*                bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgtsv2(cusparseHandle_t handle,
+               int              m,
+               int              n,
+               const float*     dl,
+               const float*     d,
+               const float*     du,
+               float*           B,
+               int              ldb,
+               void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgtsv2(cusparseHandle_t handle,
+               int              m,
+               int              n,
+               const double*    dl,
+               const double*    d,
+               const double*    du,
+               double*          B,
+               int              ldb,
+               void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgtsv2(cusparseHandle_t handle,
+               int              m,
+               int              n,
+               const cuComplex* dl,
+               const cuComplex* d,
+               const cuComplex* du,
+               cuComplex*       B,
+               int              ldb,
+               void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgtsv2(cusparseHandle_t       handle,
+               int                    m,
+               int                    n,
+               const cuDoubleComplex* dl,
+               const cuDoubleComplex* d,
+               const cuDoubleComplex* du,
+               cuDoubleComplex*       B,
+               int                    ldb,
+               void*                  pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle,
+                                     int              m,
+                                     int              n,
+                                     const float*     dl,
+                                     const float*     d,
+                                     const float*     du,
+                                     const float*     B,
+                                     int              ldb,
+                                     size_t*          bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle,
+                                     int              m,
+                                     int              n,
+                                     const double*    dl,
+                                     const double*    d,
+                                     const double*    du,
+                                     const double*    B,
+                                     int              ldb,
+                                     size_t*          bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle,
+                                     int              m,
+                                     int              n,
+                                     const cuComplex* dl,
+                                     const cuComplex* d,
+                                     const cuComplex* du,
+                                     const cuComplex* B,
+                                     int              ldb,
+                                     size_t*          bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgtsv2_nopivot_bufferSizeExt(cusparseHandle_t       handle,
+                                     int                    m,
+                                     int                    n,
+                                     const cuDoubleComplex* dl,
+                                     const cuDoubleComplex* d,
+                                     const cuDoubleComplex* du,
+                                     const cuDoubleComplex* B,
+                                     int                    ldb,
+                                     size_t*                bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgtsv2_nopivot(cusparseHandle_t handle,
+                       int              m,
+                       int              n,
+                       const float*     dl,
+                       const float*     d,
+                       const float*     du,
+                       float*           B,
+                       int              ldb,
+                       void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgtsv2_nopivot(cusparseHandle_t handle,
+                       int              m,
+                       int              n,
+                       const double*    dl,
+                       const double*    d,
+                       const double*    du,
+                       double*          B,
+                       int              ldb,
+                       void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgtsv2_nopivot(cusparseHandle_t handle,
+                       int              m,
+                       int              n,
+                       const cuComplex* dl,
+                       const cuComplex* d,
+                       const cuComplex* du,
+                       cuComplex*       B,
+                       int              ldb,
+                       void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgtsv2_nopivot(cusparseHandle_t       handle,
+                       int                    m,
+                       int                    n,
+                       const cuDoubleComplex* dl,
+                       const cuDoubleComplex* d,
+                       const cuDoubleComplex* du,
+                       cuDoubleComplex*       B,
+                       int                    ldb,
+                       void*                  pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                         int              m,
+                                         const float*     dl,
+                                         const float*     d,
+                                         const float*     du,
+                                         const float*     x,
+                                         int              batchCount,
+                                         int              batchStride,
+                                         size_t*          bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                         int              m,
+                                         const double*    dl,
+                                         const double*    d,
+                                         const double*    du,
+                                         const double*    x,
+                                         int              batchCount,
+                                         int              batchStride,
+                                         size_t*          bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                         int              m,
+                                         const cuComplex* dl,
+                                         const cuComplex* d,
+                                         const cuComplex* du,
+                                         const cuComplex* x,
+                                         int              batchCount,
+                                         int              batchStride,
+                                         size_t*          bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t       handle,
+                                         int                    m,
+                                         const cuDoubleComplex* dl,
+                                         const cuDoubleComplex* d,
+                                         const cuDoubleComplex* du,
+                                         const cuDoubleComplex* x,
+                                         int                    batchCount,
+                                         int                    batchStride,
+                                         size_t* bufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgtsv2StridedBatch(cusparseHandle_t handle,
+                           int              m,
+                           const float*     dl,
+                           const float*     d,
+                           const float*     du,
+                           float*           x,
+                           int              batchCount,
+                           int              batchStride,
+                           void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgtsv2StridedBatch(cusparseHandle_t handle,
+                           int              m,
+                           const double*    dl,
+                           const double*    d,
+                           const double*    du,
+                           double*          x,
+                           int              batchCount,
+                           int              batchStride,
+                           void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgtsv2StridedBatch(cusparseHandle_t handle,
+                           int              m,
+                           const cuComplex* dl,
+                           const cuComplex* d,
+                           const cuComplex* du,
+                           cuComplex*       x,
+                           int              batchCount,
+                           int              batchStride,
+                           void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgtsv2StridedBatch(cusparseHandle_t       handle,
+                           int                    m,
+                           const cuDoubleComplex* dl,
+                           const cuDoubleComplex* d,
+                           const cuDoubleComplex* du,
+                           cuDoubleComplex*       x,
+                           int                    batchCount,
+                           int                    batchStride,
+                           void*                  pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo,
+                                            int              m,
+                                            const float*     dl,
+                                            const float*     d,
+                                            const float*     du,
+                                            const float*     x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                         int              algo,
+                                         int              m,
+                                         const double*    dl,
+                                         const double*    d,
+                                         const double*    du,
+                                         const double*    x,
+                                         int              batchCount,
+                                         size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo,
+                                            int              m,
+                                            const cuComplex* dl,
+                                            const cuComplex* d,
+                                            const cuComplex* du,
+                                            const cuComplex* x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t       handle,
+                                            int                    algo,
+                                            int                    m,
+                                            const cuDoubleComplex* dl,
+                                            const cuDoubleComplex* d,
+                                            const cuDoubleComplex* du,
+                                            const cuDoubleComplex* x,
+                                            int                    batchCount,
+                                            size_t*        pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgtsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo,
+                              int              m,
+                              float*           dl,
+                              float*           d,
+                              float*           du,
+                              float*           x,
+                              int              batchCount,
+                              void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgtsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo,
+                              int              m,
+                              double*          dl,
+                              double*          d,
+                              double*          du,
+                              double*          x,
+                              int              batchCount,
+                              void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgtsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo,
+                              int              m,
+                              cuComplex*       dl,
+                              cuComplex*       d,
+                              cuComplex*       du,
+                              cuComplex*       x,
+                              int              batchCount,
+                              void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgtsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo,
+                              int              m,
+                              cuDoubleComplex* dl,
+                              cuDoubleComplex* d,
+                              cuDoubleComplex* du,
+                              cuDoubleComplex* x,
+                              int              batchCount,
+                              void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo,
+                                            int              m,
+                                            const float*     ds,
+                                            const float*     dl,
+                                            const float*     d,
+                                            const float*     du,
+                                            const float*     dw,
+                                            const float*     x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo,
+                                            int              m,
+                                            const double*    ds,
+                                            const double*    dl,
+                                            const double*    d,
+                                            const double*    du,
+                                            const double*    dw,
+                                            const double*    x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo,
+                                            int              m,
+                                            const cuComplex* ds,
+                                            const cuComplex* dl,
+                                            const cuComplex* d,
+                                            const cuComplex* du,
+                                            const cuComplex* dw,
+                                            const cuComplex* x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t       handle,
+                                            int                    algo,
+                                            int                    m,
+                                            const cuDoubleComplex* ds,
+                                            const cuDoubleComplex* dl,
+                                            const cuDoubleComplex* d,
+                                            const cuDoubleComplex* du,
+                                            const cuDoubleComplex* dw,
+                                            const cuDoubleComplex* x,
+                                            int                    batchCount,
+                                            size_t*         pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgpsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo,
+                              int              m,
+                              float*           ds,
+                              float*           dl,
+                              float*           d,
+                              float*           du,
+                              float*           dw,
+                              float*           x,
+                              int              batchCount,
+                              void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgpsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo,
+                              int              m,
+                              double*          ds,
+                              double*          dl,
+                              double*          d,
+                              double*          du,
+                              double*          dw,
+                              double*          x,
+                              int              batchCount,
+                              void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgpsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo,
+                              int              m,
+                              cuComplex*       ds,
+                              cuComplex*       dl,
+                              cuComplex*       d,
+                              cuComplex*       du,
+                              cuComplex*       dw,
+                              cuComplex*       x,
+                              int              batchCount,
+                              void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgpsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo,
+                              int              m,
+                              cuDoubleComplex* ds,
+                              cuDoubleComplex* dl,
+                              cuDoubleComplex* d,
+                              cuDoubleComplex* du,
+                              cuDoubleComplex* dw,
+                              cuDoubleComplex* x,
+                              int              batchCount,
+                              void*            pBuffer);
+
+//##############################################################################
+//# EXTRA ROUTINES
+//##############################################################################
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsrgemm2Info(csrgemm2Info_t* info);
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyCsrgemm2Info(csrgemm2Info_t info);
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrgemm2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                int                      k,
+                                const float*             alpha,
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const float*             beta,
+                                const cusparseMatDescr_t descrD,
+                                int                      nnzD,
+                                const int*               csrSortedRowPtrD,
+                                const int*               csrSortedColIndD,
+                                csrgemm2Info_t           info,
+                                size_t*                  pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrgemm2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                int                      k,
+                                const double*            alpha,
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const double*            beta,
+                                const cusparseMatDescr_t descrD,
+                                int                      nnzD,
+                                const int*               csrSortedRowPtrD,
+                                const int*               csrSortedColIndD,
+                                csrgemm2Info_t           info,
+                                size_t*                  pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrgemm2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                int                      k,
+                                const cuComplex*         alpha,
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const cuComplex*         beta,
+                                const cusparseMatDescr_t descrD,
+                                int                      nnzD,
+                                const int*               csrSortedRowPtrD,
+                                const int*               csrSortedColIndD,
+                                csrgemm2Info_t           info,
+                                size_t*                  pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrgemm2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                int                      k,
+                                const cuDoubleComplex*   alpha,
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const cuDoubleComplex*   beta,
+                                const cusparseMatDescr_t descrD,
+                                int                      nnzD,
+                                const int*               csrSortedRowPtrD,
+                                const int*               csrSortedColIndD,
+                                csrgemm2Info_t           info,
+                                size_t*                  pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsrgemm2Nnz(cusparseHandle_t         handle,
+                     int                      m,
+                     int                      n,
+                     int                      k,
+                     const cusparseMatDescr_t descrA,
+                     int                      nnzA,
+                     const int*               csrSortedRowPtrA,
+                     const int*               csrSortedColIndA,
+                     const cusparseMatDescr_t descrB,
+                     int                      nnzB,
+                     const int*               csrSortedRowPtrB,
+                     const int*               csrSortedColIndB,
+                     const cusparseMatDescr_t descrD,
+                     int                      nnzD,
+                     const int*               csrSortedRowPtrD,
+                     const int*               csrSortedColIndD,
+                     const cusparseMatDescr_t descrC,
+                     int*                     csrSortedRowPtrC,
+                     int*                     nnzTotalDevHostPtr,
+                     const csrgemm2Info_t     info,
+                     void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrgemm2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      k,
+                  const float*             alpha,
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const float*             csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const float*             csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const float*             beta,
+                  const cusparseMatDescr_t descrD,
+                  int                      nnzD,
+                  const float*             csrSortedValD,
+                  const int*               csrSortedRowPtrD,
+                  const int*               csrSortedColIndD,
+                  const cusparseMatDescr_t descrC,
+                  float*                   csrSortedValC,
+                  const int*               csrSortedRowPtrC,
+                  int*                     csrSortedColIndC,
+                  const csrgemm2Info_t     info,
+                  void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrgemm2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      k,
+                  const double*            alpha,
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const double*            csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const double*            csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const double*            beta,
+                  const cusparseMatDescr_t descrD,
+                  int                      nnzD,
+                  const double*            csrSortedValD,
+                  const int*               csrSortedRowPtrD,
+                  const int*               csrSortedColIndD,
+                  const cusparseMatDescr_t descrC,
+                  double*                  csrSortedValC,
+                  const int*               csrSortedRowPtrC,
+                  int*                     csrSortedColIndC,
+                  const csrgemm2Info_t     info,
+                  void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrgemm2(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      n,
+                 int                      k,
+                 const cuComplex*         alpha,
+                 const cusparseMatDescr_t descrA,
+                 int                      nnzA,
+                 const cuComplex*         csrSortedValA,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 const cusparseMatDescr_t descrB,
+                 int                      nnzB,
+                 const cuComplex*         csrSortedValB,
+                 const int*               csrSortedRowPtrB,
+                 const int*               csrSortedColIndB,
+                 const cuComplex*         beta,
+                 const cusparseMatDescr_t descrD,
+                 int                      nnzD,
+                 const cuComplex*         csrSortedValD,
+                 const int*               csrSortedRowPtrD,
+                 const int*               csrSortedColIndD,
+                 const cusparseMatDescr_t descrC,
+                 cuComplex*               csrSortedValC,
+                 const int*               csrSortedRowPtrC,
+                 int*                     csrSortedColIndC,
+                 const csrgemm2Info_t     info,
+                 void*                    pBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSpGEMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrgemm2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      k,
+                  const cuDoubleComplex*   alpha,
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const cuDoubleComplex*   csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const cuDoubleComplex*   csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const cuDoubleComplex*   beta,
+                  const cusparseMatDescr_t descrD,
+                  int                      nnzD,
+                  const cuDoubleComplex*   csrSortedValD,
+                  const int*               csrSortedRowPtrD,
+                  const int*               csrSortedColIndD,
+                  const cusparseMatDescr_t descrC,
+                  cuDoubleComplex*         csrSortedValC,
+                  const int*               csrSortedRowPtrC,
+                  int*                     csrSortedColIndC,
+                  const csrgemm2Info_t     info,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrgeam2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                const float*             alpha,
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const float*             csrSortedValA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const float*             beta,
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const float*             csrSortedValB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const cusparseMatDescr_t descrC,
+                                const float*             csrSortedValC,
+                                const int*               csrSortedRowPtrC,
+                                const int*               csrSortedColIndC,
+                                size_t*                  pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrgeam2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                const double*            alpha,
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const double*            csrSortedValA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const double*            beta,
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const double*            csrSortedValB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const cusparseMatDescr_t descrC,
+                                const double*            csrSortedValC,
+                                const int*               csrSortedRowPtrC,
+                                const int*               csrSortedColIndC,
+                                size_t*                  pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrgeam2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                const cuComplex*         alpha,
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const cuComplex*         csrSortedValA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const cuComplex*         beta,
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const cuComplex*         csrSortedValB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const cusparseMatDescr_t descrC,
+                                const cuComplex*         csrSortedValC,
+                                const int*               csrSortedRowPtrC,
+                                const int*               csrSortedColIndC,
+                                size_t*                  pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrgeam2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                const cuDoubleComplex*   alpha,
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const cuDoubleComplex*   csrSortedValA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const cuDoubleComplex*   beta,
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const cuDoubleComplex*   csrSortedValB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const cusparseMatDescr_t descrC,
+                                const cuDoubleComplex*   csrSortedValC,
+                                const int*               csrSortedRowPtrC,
+                                const int*               csrSortedColIndC,
+                                size_t*                  pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsrgeam2Nnz(cusparseHandle_t         handle,
+                     int                      m,
+                     int                      n,
+                     const cusparseMatDescr_t descrA,
+                     int                      nnzA,
+                     const int*               csrSortedRowPtrA,
+                     const int*               csrSortedColIndA,
+                     const cusparseMatDescr_t descrB,
+                     int                      nnzB,
+                     const int*               csrSortedRowPtrB,
+                     const int*               csrSortedColIndB,
+                     const cusparseMatDescr_t descrC,
+                     int*                     csrSortedRowPtrC,
+                     int*                     nnzTotalDevHostPtr,
+                     void*                    workspace);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrgeam2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  const float*             alpha,
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const float*             csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const float*             beta,
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const float*             csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const cusparseMatDescr_t descrC,
+                  float*                   csrSortedValC,
+                  int*                     csrSortedRowPtrC,
+                  int*                     csrSortedColIndC,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrgeam2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  const double*            alpha,
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const double*            csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const double*            beta,
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const double*            csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const cusparseMatDescr_t descrC,
+                  double*                  csrSortedValC,
+                  int*                     csrSortedRowPtrC,
+                  int*                     csrSortedColIndC,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrgeam2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  const cuComplex*         alpha,
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const cuComplex*         csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const cuComplex*         beta,
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const cuComplex*         csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const cusparseMatDescr_t descrC,
+                  cuComplex*               csrSortedValC,
+                  int*                     csrSortedRowPtrC,
+                  int*                     csrSortedColIndC,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrgeam2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  const cuDoubleComplex*   alpha,
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const cuDoubleComplex*   csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const cuDoubleComplex*   beta,
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const cuDoubleComplex*   csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const cusparseMatDescr_t descrC,
+                  cuDoubleComplex*         csrSortedValC,
+                  int*                     csrSortedRowPtrC,
+                  int*                     csrSortedColIndC,
+                  void*                    pBuffer);
+
+//##############################################################################
+//# SPARSE MATRIX REORDERING
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrcolor(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  const float*              csrSortedValA,
+                  const int*                csrSortedRowPtrA,
+                  const int*                csrSortedColIndA,
+                  const float*              fractionToColor,
+                  int*                      ncolors,
+                  int*                      coloring,
+                  int*                      reordering,
+                  const cusparseColorInfo_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrcolor(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  const double*            csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const double*            fractionToColor,
+                  int*                     ncolors,
+                  int*                     coloring,
+                  int*                     reordering,
+                  const cusparseColorInfo_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrcolor(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  const cuComplex*          csrSortedValA,
+                  const int*                csrSortedRowPtrA,
+                  const int*                csrSortedColIndA,
+                  const float*              fractionToColor,
+                  int*                      ncolors,
+                  int*                      coloring,
+                  int*                      reordering,
+                  const cusparseColorInfo_t info);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrcolor(cusparseHandle_t          handle,
+                  int                       m,
+                  int                       nnz,
+                  const cusparseMatDescr_t  descrA,
+                  const cuDoubleComplex*    csrSortedValA,
+                  const int*                csrSortedRowPtrA,
+                  const int*                csrSortedColIndA,
+                  const double*             fractionToColor,
+                  int*                      ncolors,
+                  int*                      coloring,
+                  int*                      reordering,
+                  const cusparseColorInfo_t info);
+
+//##############################################################################
+//# SPARSE FORMAT CONVERSION
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSnnz(cusparseHandle_t         handle,
+             cusparseDirection_t      dirA,
+             int                      m,
+             int                      n,
+             const cusparseMatDescr_t descrA,
+             const float*             A,
+             int                      lda,
+             int*                     nnzPerRowCol,
+             int*                     nnzTotalDevHostPtr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnnz(cusparseHandle_t         handle,
+             cusparseDirection_t      dirA,
+             int                      m,
+             int                      n,
+             const cusparseMatDescr_t descrA,
+             const double*            A,
+             int                      lda,
+             int*                     nnzPerRowCol,
+             int*                     nnzTotalDevHostPtr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCnnz(cusparseHandle_t         handle,
+             cusparseDirection_t      dirA,
+             int                      m,
+             int                      n,
+             const cusparseMatDescr_t descrA,
+             const cuComplex*         A,
+             int                      lda,
+             int*                     nnzPerRowCol,
+             int*                     nnzTotalDevHostPtr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZnnz(cusparseHandle_t         handle,
+             cusparseDirection_t      dirA,
+             int                      m,
+             int                      n,
+             const cusparseMatDescr_t descrA,
+             const cuDoubleComplex*   A,
+             int                      lda,
+             int*                     nnzPerRowCol,
+             int*                     nnzTotalDevHostPtr);
+
+//##############################################################################
+//# SPARSE FORMAT CONVERSION
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSnnz_compress(cusparseHandle_t         handle,
+                      int                      m,
+                      const cusparseMatDescr_t descr,
+                      const float*             csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      int*                     nnzPerRow,
+                      int*                     nnzC,
+                      float                    tol);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnnz_compress(cusparseHandle_t         handle,
+                      int                      m,
+                      const cusparseMatDescr_t descr,
+                      const double*            csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      int*                     nnzPerRow,
+                      int*                     nnzC,
+                      double                   tol);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCnnz_compress(cusparseHandle_t         handle,
+                      int                      m,
+                      const cusparseMatDescr_t descr,
+                      const cuComplex*         csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      int*                     nnzPerRow,
+                      int*                     nnzC,
+                      cuComplex                tol);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZnnz_compress(cusparseHandle_t         handle,
+                      int                      m,
+                      const cusparseMatDescr_t descr,
+                      const cuDoubleComplex*   csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      int*                     nnzPerRow,
+                      int*                     nnzC,
+                      cuDoubleComplex          tol);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2csr_compress(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      n,
+                          const cusparseMatDescr_t descrA,
+                          const float*             csrSortedValA,
+                          const int*               csrSortedColIndA,
+                          const int*               csrSortedRowPtrA,
+                          int                      nnzA,
+                          const int*               nnzPerRow,
+                          float*                   csrSortedValC,
+                          int*                     csrSortedColIndC,
+                          int*                     csrSortedRowPtrC,
+                          float                    tol);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2csr_compress(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      n,
+                          const cusparseMatDescr_t descrA,
+                          const double*            csrSortedValA,
+                          const int*               csrSortedColIndA,
+                          const int*               csrSortedRowPtrA,
+                          int                      nnzA,
+                          const int*               nnzPerRow,
+                          double*                  csrSortedValC,
+                          int*                     csrSortedColIndC,
+                          int*                     csrSortedRowPtrC,
+                          double                   tol);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2csr_compress(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      n,
+                          const cusparseMatDescr_t descrA,
+                          const cuComplex*         csrSortedValA,
+                          const int*               csrSortedColIndA,
+                          const int*               csrSortedRowPtrA,
+                          int                      nnzA,
+                          const int*               nnzPerRow,
+                          cuComplex*               csrSortedValC,
+                          int*                     csrSortedColIndC,
+                          int*                     csrSortedRowPtrC,
+                          cuComplex                tol);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2csr_compress(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      n,
+                          const cusparseMatDescr_t descrA,
+                          const cuDoubleComplex*   csrSortedValA,
+                          const int*               csrSortedColIndA,
+                          const int*               csrSortedRowPtrA,
+                          int                      nnzA,
+                          const int*               nnzPerRow,
+                          cuDoubleComplex*         csrSortedValC,
+                          int*                     csrSortedColIndC,
+                          int*                     csrSortedRowPtrC,
+                          cuDoubleComplex          tol);
+
+CUSPARSE_DEPRECATED(cusparseDenseToSparse)
+cusparseStatus_t CUSPARSEAPI
+cusparseSdense2csr(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const float*             A,
+                   int                      lda,
+                   const int*               nnzPerRow,
+                   float*                   csrSortedValA,
+                   int*                     csrSortedRowPtrA,
+                   int*                     csrSortedColIndA);
+
+CUSPARSE_DEPRECATED(cusparseDenseToSparse)
+cusparseStatus_t CUSPARSEAPI
+cusparseDdense2csr(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const double*            A,
+                   int                      lda,
+                   const int*               nnzPerRow,
+                   double*                  csrSortedValA,
+                   int*                     csrSortedRowPtrA,
+                   int*                     csrSortedColIndA);
+
+CUSPARSE_DEPRECATED(cusparseDenseToSparse)
+cusparseStatus_t CUSPARSEAPI
+cusparseCdense2csr(cusparseHandle_t           handle,
+                     int                      m,
+                     int                      n,
+                     const cusparseMatDescr_t descrA,
+                     const cuComplex*         A,
+                     int                      lda,
+                     const int*               nnzPerRow,
+                     cuComplex*               csrSortedValA,
+                     int*                     csrSortedRowPtrA,
+                     int*                     csrSortedColIndA);
+
+CUSPARSE_DEPRECATED(cusparseDenseToSparse)
+cusparseStatus_t CUSPARSEAPI
+cusparseZdense2csr(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const cuDoubleComplex*   A,
+                   int                      lda,
+                   const int*               nnzPerRow,
+                   cuDoubleComplex*         csrSortedValA,
+                   int*                     csrSortedRowPtrA,
+                   int*                     csrSortedColIndA);
+
+CUSPARSE_DEPRECATED(cusparseSparseToDense)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2dense(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const float*             csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   float*                   A,
+                   int                      lda);
+
+CUSPARSE_DEPRECATED(cusparseSparseToDense)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2dense(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const double*            csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   double*                  A,
+                   int                      lda);
+
+CUSPARSE_DEPRECATED(cusparseSparseToDense)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2dense(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const cuComplex*         csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   cuComplex*               A,
+                   int                      lda);
+
+CUSPARSE_DEPRECATED(cusparseSparseToDense)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2dense(cusparseHandle_t         handle,
+                int                      m,
+                int                      n,
+                const cusparseMatDescr_t descrA,
+                const cuDoubleComplex*   csrSortedValA,
+                const int*               csrSortedRowPtrA,
+                const int*               csrSortedColIndA,
+                cuDoubleComplex*         A,
+                int                      lda);
+
+CUSPARSE_DEPRECATED(cusparseDenseToSparse)
+cusparseStatus_t CUSPARSEAPI
+cusparseSdense2csc(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const float*             A,
+                   int                      lda,
+                   const int*               nnzPerCol,
+                   float*                   cscSortedValA,
+                   int*                     cscSortedRowIndA,
+                   int*                     cscSortedColPtrA);
+
+CUSPARSE_DEPRECATED(cusparseDenseToSparse)
+cusparseStatus_t CUSPARSEAPI
+cusparseDdense2csc(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const double*            A,
+                   int                      lda,
+                   const int*               nnzPerCol,
+                   double*                  cscSortedValA,
+                   int*                     cscSortedRowIndA,
+                   int*                     cscSortedColPtrA);
+
+CUSPARSE_DEPRECATED(cusparseDenseToSparse)
+cusparseStatus_t CUSPARSEAPI
+cusparseCdense2csc(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const cuComplex*         A,
+                   int                      lda,
+                   const int*               nnzPerCol,
+                   cuComplex*               cscSortedValA,
+                   int*                     cscSortedRowIndA,
+                   int*                     cscSortedColPtrA);
+
+CUSPARSE_DEPRECATED(cusparseDenseToSparse)
+cusparseStatus_t CUSPARSEAPI
+cusparseZdense2csc(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const cuDoubleComplex*   A,
+                   int                      lda,
+                   const int*               nnzPerCol,
+                   cuDoubleComplex*         cscSortedValA,
+                   int*                     cscSortedRowIndA,
+                   int*                     cscSortedColPtrA);
+
+CUSPARSE_DEPRECATED(cusparseSparseToDense)
+cusparseStatus_t CUSPARSEAPI
+cusparseScsc2dense(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const float*             cscSortedValA,
+                   const int*               cscSortedRowIndA,
+                   const int*               cscSortedColPtrA,
+                   float*                   A,
+                   int                      lda);
+
+CUSPARSE_DEPRECATED(cusparseSparseToDense)
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsc2dense(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const double*            cscSortedValA,
+                   const int*               cscSortedRowIndA,
+                   const int*               cscSortedColPtrA,
+                   double*                  A,
+                   int                      lda);
+
+CUSPARSE_DEPRECATED(cusparseSparseToDense)
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsc2dense(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const cuComplex*         cscSortedValA,
+                   const int*               cscSortedRowIndA,
+                   const int*               cscSortedColPtrA,
+                   cuComplex*               A,
+                   int                      lda);
+
+CUSPARSE_DEPRECATED(cusparseSparseToDense)
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsc2dense(cusparseHandle_t         handle,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const cuDoubleComplex*   cscSortedValA,
+                   const int*               cscSortedRowIndA,
+                   const int*               cscSortedColPtrA,
+                   cuDoubleComplex*         A,
+                   int                      lda);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcoo2csr(cusparseHandle_t    handle,
+                 const int*          cooRowInd,
+                 int                 nnz,
+                 int                 m,
+                 int*                csrSortedRowPtr,
+                 cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsr2coo(cusparseHandle_t    handle,
+                 const int*          csrSortedRowPtr,
+                 int                 nnz,
+                 int                 m,
+                 int*                cooRowInd,
+                 cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsr2bsrNnz(cusparseHandle_t         handle,
+                    cusparseDirection_t      dirA,
+                    int                      m,
+                    int                      n,
+                    const cusparseMatDescr_t descrA,
+                    const int*               csrSortedRowPtrA,
+                    const int*               csrSortedColIndA,
+                    int                      blockDim,
+                    const cusparseMatDescr_t descrC,
+                    int*                     bsrSortedRowPtrC,
+                    int*                     nnzTotalDevHostPtr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2bsr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      m,
+                 int                      n,
+                 const cusparseMatDescr_t descrA,
+                 const float*             csrSortedValA,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 float*                   bsrSortedValC,
+                 int*                     bsrSortedRowPtrC,
+                 int*                     bsrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2bsr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      m,
+                 int                      n,
+                 const cusparseMatDescr_t descrA,
+                 const double*            csrSortedValA,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 double*                  bsrSortedValC,
+                 int*                     bsrSortedRowPtrC,
+                 int*                     bsrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2bsr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      m,
+                 int                      n,
+                 const cusparseMatDescr_t descrA,
+                 const cuComplex*         csrSortedValA,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 cuComplex*               bsrSortedValC,
+                 int*                     bsrSortedRowPtrC,
+                 int*                     bsrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2bsr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      m,
+                 int                      n,
+                 const cusparseMatDescr_t descrA,
+                 const cuDoubleComplex*   csrSortedValA,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 cuDoubleComplex*         bsrSortedValC,
+                 int*                     bsrSortedRowPtrC,
+                 int*                     bsrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsr2csr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nb,
+                 const cusparseMatDescr_t descrA,
+                 const float*             bsrSortedValA,
+                 const int*               bsrSortedRowPtrA,
+                 const int*               bsrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 float*                   csrSortedValC,
+                 int*                     csrSortedRowPtrC,
+                 int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsr2csr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nb,
+                 const cusparseMatDescr_t descrA,
+                 const double*            bsrSortedValA,
+                 const int*               bsrSortedRowPtrA,
+                 const int*               bsrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 double*                  csrSortedValC,
+                 int*                     csrSortedRowPtrC,
+                 int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsr2csr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nb,
+                 const cusparseMatDescr_t descrA,
+                 const cuComplex*         bsrSortedValA,
+                 const int*               bsrSortedRowPtrA,
+                 const int*               bsrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 cuComplex*               csrSortedValC,
+                 int*                     csrSortedRowPtrC,
+                 int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsr2csr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nb,
+                 const cusparseMatDescr_t descrA,
+                 const cuDoubleComplex*   bsrSortedValA,
+                 const int*               bsrSortedRowPtrA,
+                 const int*               bsrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 cuDoubleComplex*         csrSortedValC,
+                 int*                     csrSortedRowPtrC,
+                 int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsc_bufferSize(cusparseHandle_t handle,
+                                int              mb,
+                                int              nb,
+                                int              nnzb,
+                                const float*     bsrSortedVal,
+                                const int*       bsrSortedRowPtr,
+                                const int*       bsrSortedColInd,
+                                int              rowBlockDim,
+                                int              colBlockDim,
+                                int*             pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsc_bufferSize(cusparseHandle_t handle,
+                                int              mb,
+                                int              nb,
+                                int              nnzb,
+                                const double*    bsrSortedVal,
+                                const int*       bsrSortedRowPtr,
+                                const int*       bsrSortedColInd,
+                                int              rowBlockDim,
+                                int              colBlockDim,
+                                int*             pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsc_bufferSize(cusparseHandle_t handle,
+                                int              mb,
+                                int              nb,
+                                int              nnzb,
+                                const cuComplex* bsrSortedVal,
+                                const int*       bsrSortedRowPtr,
+                                const int*       bsrSortedColInd,
+                                int              rowBlockDim,
+                                int              colBlockDim,
+                                int*             pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsc_bufferSize(cusparseHandle_t       handle,
+                                int                    mb,
+                                int                    nb,
+                                int                    nnzb,
+                                const cuDoubleComplex* bsrSortedVal,
+                                const int*             bsrSortedRowPtr,
+                                const int*             bsrSortedColInd,
+                                int                    rowBlockDim,
+                                int                    colBlockDim,
+                                int*                   pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle,
+                                   int              mb,
+                                   int              nb,
+                                   int              nnzb,
+                                   const float*     bsrSortedVal,
+                                   const int*       bsrSortedRowPtr,
+                                   const int*       bsrSortedColInd,
+                                   int              rowBlockDim,
+                                   int              colBlockDim,
+                                   size_t*          pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle,
+                                   int              mb,
+                                   int              nb,
+                                   int              nnzb,
+                                   const double*    bsrSortedVal,
+                                   const int*       bsrSortedRowPtr,
+                                   const int*       bsrSortedColInd,
+                                   int              rowBlockDim,
+                                   int              colBlockDim,
+                                   size_t*          pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle,
+                                   int              mb,
+                                   int              nb,
+                                   int              nnzb,
+                                   const cuComplex* bsrSortedVal,
+                                   const int*       bsrSortedRowPtr,
+                                   const int*       bsrSortedColInd,
+                                   int              rowBlockDim,
+                                   int              colBlockDim,
+                                   size_t*          pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsc_bufferSizeExt(cusparseHandle_t       handle,
+                                   int                    mb,
+                                   int                    nb,
+                                   int                    nnzb,
+                                   const cuDoubleComplex* bsrSortedVal,
+                                   const int*             bsrSortedRowPtr,
+                                   const int*             bsrSortedColInd,
+                                   int                    rowBlockDim,
+                                   int                    colBlockDim,
+                                   size_t*                pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsc(cusparseHandle_t handle,
+                     int              mb,
+                     int              nb,
+                     int              nnzb,
+                     const float*     bsrSortedVal,
+                     const int* bsrSortedRowPtr,
+                     const int* bsrSortedColInd,
+                     int        rowBlockDim,
+                     int        colBlockDim,
+                     float*     bscVal,
+                     int*       bscRowInd,
+                     int*       bscColPtr,
+                     cusparseAction_t copyValues,
+                     cusparseIndexBase_t idxBase,
+                     void*               pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsc(cusparseHandle_t    handle,
+                     int                 mb,
+                     int                 nb,
+                     int                 nnzb,
+                     const double*       bsrSortedVal,
+                     const int*          bsrSortedRowPtr,
+                     const int*          bsrSortedColInd,
+                     int                 rowBlockDim,
+                     int                 colBlockDim,
+                     double*             bscVal,
+                     int*                bscRowInd,
+                     int*                bscColPtr,
+                     cusparseAction_t    copyValues,
+                     cusparseIndexBase_t idxBase,
+                     void*               pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsc(cusparseHandle_t    handle,
+                     int                 mb,
+                     int                 nb,
+                     int                 nnzb,
+                     const cuComplex*    bsrSortedVal,
+                     const int*          bsrSortedRowPtr,
+                     const int*          bsrSortedColInd,
+                     int                 rowBlockDim,
+                     int                 colBlockDim,
+                     cuComplex*          bscVal,
+                     int*                bscRowInd,
+                     int*                bscColPtr,
+                     cusparseAction_t    copyValues,
+                     cusparseIndexBase_t idxBase,
+                     void*               pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsc(cusparseHandle_t       handle,
+                     int                    mb,
+                     int                    nb,
+                     int                    nnzb,
+                     const cuDoubleComplex* bsrSortedVal,
+                     const int*             bsrSortedRowPtr,
+                     const int*             bsrSortedColInd,
+                     int                    rowBlockDim,
+                     int                    colBlockDim,
+                     cuDoubleComplex*       bscVal,
+                     int*                   bscRowInd,
+                     int*                   bscColPtr,
+                     cusparseAction_t       copyValues,
+                     cusparseIndexBase_t    idxBase,
+                     void*                  pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXgebsr2csr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      mb,
+                   int                      nb,
+                   const cusparseMatDescr_t descrA,
+                   const int*               bsrSortedRowPtrA,
+                   const int*               bsrSortedColIndA,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   const cusparseMatDescr_t descrC,
+                   int*                     csrSortedRowPtrC,
+                   int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2csr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      mb,
+                   int                      nb,
+                   const cusparseMatDescr_t descrA,
+                   const float*             bsrSortedValA,
+                   const int*               bsrSortedRowPtrA,
+                   const int*               bsrSortedColIndA,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   const cusparseMatDescr_t descrC,
+                   float*                   csrSortedValC,
+                   int*                     csrSortedRowPtrC,
+                   int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2csr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      mb,
+                   int                      nb,
+                   const cusparseMatDescr_t descrA,
+                   const double*            bsrSortedValA,
+                   const int*               bsrSortedRowPtrA,
+                   const int*               bsrSortedColIndA,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   const cusparseMatDescr_t descrC,
+                   double*                  csrSortedValC,
+                   int*                     csrSortedRowPtrC,
+                   int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2csr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      mb,
+                   int                      nb,
+                   const cusparseMatDescr_t descrA,
+                   const cuComplex*         bsrSortedValA,
+                   const int*               bsrSortedRowPtrA,
+                   const int*               bsrSortedColIndA,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   const cusparseMatDescr_t descrC,
+                   cuComplex*               csrSortedValC,
+                   int*                     csrSortedRowPtrC,
+                   int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2csr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      mb,
+                   int                      nb,
+                   const cusparseMatDescr_t descrA,
+                   const cuDoubleComplex*   bsrSortedValA,
+                   const int*               bsrSortedRowPtrA,
+                   const int*               bsrSortedColIndA,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   const cusparseMatDescr_t descrC,
+                   cuDoubleComplex*         csrSortedValC,
+                   int*                     csrSortedRowPtrC,
+                   int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              int                      m,
+                              int                      n,
+                              const cusparseMatDescr_t descrA,
+                              const float*             csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              int                      rowBlockDim,
+                              int                      colBlockDim,
+                              int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              int                      m,
+                              int                      n,
+                              const cusparseMatDescr_t descrA,
+                              const double*            csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              int                      rowBlockDim,
+                              int                      colBlockDim,
+                              int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              int                      m,
+                              int                      n,
+                              const cusparseMatDescr_t descrA,
+                              const cuComplex*         csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              int                      rowBlockDim,
+                              int                      colBlockDim,
+                              int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              int                      m,
+                              int                      n,
+                              const cusparseMatDescr_t descrA,
+                              const cuDoubleComplex*   csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              int                      rowBlockDim,
+                              int                      colBlockDim,
+                              int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                 cusparseDirection_t      dirA,
+                                 int                      m,
+                                 int                      n,
+                                 const cusparseMatDescr_t descrA,
+                                 const float*             csrSortedValA,
+                                 const int*               csrSortedRowPtrA,
+                                 const int*               csrSortedColIndA,
+                                 int                      rowBlockDim,
+                                 int                      colBlockDim,
+                                 size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                 cusparseDirection_t      dirA,
+                                 int                      m,
+                                 int                      n,
+                                 const cusparseMatDescr_t descrA,
+                                 const double*            csrSortedValA,
+                                 const int*               csrSortedRowPtrA,
+                                 const int*               csrSortedColIndA,
+                                 int                      rowBlockDim,
+                                 int                      colBlockDim,
+                                 size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                 cusparseDirection_t      dirA,
+                                 int                      m,
+                                 int                      n,
+                                 const cusparseMatDescr_t descrA,
+                                 const cuComplex*         csrSortedValA,
+                                 const int*               csrSortedRowPtrA,
+                                 const int*               csrSortedColIndA,
+                                 int                      rowBlockDim,
+                                 int                      colBlockDim,
+                                 size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                 cusparseDirection_t      dirA,
+                                 int                      m,
+                                 int                      n,
+                                 const cusparseMatDescr_t descrA,
+                                 const cuDoubleComplex*   csrSortedValA,
+                                 const int*               csrSortedRowPtrA,
+                                 const int*               csrSortedColIndA,
+                                 int                      rowBlockDim,
+                                 int                      colBlockDim,
+                                 size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsr2gebsrNnz(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      int                      m,
+                      int                      n,
+                      const cusparseMatDescr_t descrA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      const cusparseMatDescr_t descrC,
+                      int*                     bsrSortedRowPtrC,
+                      int                      rowBlockDim,
+                      int                      colBlockDim,
+                      int*                     nnzTotalDevHostPtr,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2gebsr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const float*             csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   const cusparseMatDescr_t descrC,
+                   float*                   bsrSortedValC,
+                   int*                     bsrSortedRowPtrC,
+                   int*                     bsrSortedColIndC,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2gebsr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const double*            csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   const cusparseMatDescr_t descrC,
+                   double*                  bsrSortedValC,
+                   int*                     bsrSortedRowPtrC,
+                   int*                     bsrSortedColIndC,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2gebsr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const cuComplex*         csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   const cusparseMatDescr_t descrC,
+                   cuComplex*               bsrSortedValC,
+                   int*                     bsrSortedRowPtrC,
+                   int*                     bsrSortedColIndC,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2gebsr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const cuDoubleComplex*   csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   const cusparseMatDescr_t descrC,
+                   cuDoubleComplex*         bsrSortedValC,
+                   int*                     bsrSortedRowPtrC,
+                   int*                     bsrSortedColIndC,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                const float*             bsrSortedValA,
+                                const int*               bsrSortedRowPtrA,
+                                const int*               bsrSortedColIndA,
+                                int                      rowBlockDimA,
+                                int                      colBlockDimA,
+                                int                      rowBlockDimC,
+                                int                      colBlockDimC,
+                                int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                const double*            bsrSortedValA,
+                                const int*               bsrSortedRowPtrA,
+                                const int*               bsrSortedColIndA,
+                                int                      rowBlockDimA,
+                                int                      colBlockDimA,
+                                int                      rowBlockDimC,
+                                int                      colBlockDimC,
+                                int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                const cuComplex*         bsrSortedValA,
+                                const int*               bsrSortedRowPtrA,
+                                const int*               bsrSortedColIndA,
+                                int                      rowBlockDimA,
+                                int                      colBlockDimA,
+                                int                      rowBlockDimC,
+                                int                      colBlockDimC,
+                                int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                const cuDoubleComplex*   bsrSortedValA,
+                                const int*               bsrSortedRowPtrA,
+                                const int*               bsrSortedColIndA,
+                                int                      rowBlockDimA,
+                                int                      colBlockDimA,
+                                int                      rowBlockDimC,
+                                int                      colBlockDimC,
+                                int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                   cusparseDirection_t      dirA,
+                                   int                      mb,
+                                   int                      nb,
+                                   int                      nnzb,
+                                   const cusparseMatDescr_t descrA,
+                                   const float*             bsrSortedValA,
+                                   const int*               bsrSortedRowPtrA,
+                                   const int*               bsrSortedColIndA,
+                                   int                      rowBlockDimA,
+                                   int                      colBlockDimA,
+                                   int                      rowBlockDimC,
+                                   int                      colBlockDimC,
+                                   size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                   cusparseDirection_t      dirA,
+                                   int                      mb,
+                                   int                      nb,
+                                   int                      nnzb,
+                                   const cusparseMatDescr_t descrA,
+                                   const double*            bsrSortedValA,
+                                   const int*               bsrSortedRowPtrA,
+                                   const int*               bsrSortedColIndA,
+                                   int                      rowBlockDimA,
+                                   int                      colBlockDimA,
+                                   int                      rowBlockDimC,
+                                   int                      colBlockDimC,
+                                   size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                   cusparseDirection_t      dirA,
+                                   int                      mb,
+                                   int                      nb,
+                                   int                      nnzb,
+                                   const cusparseMatDescr_t descrA,
+                                   const cuComplex*         bsrSortedValA,
+                                   const int*               bsrSortedRowPtrA,
+                                   const int*               bsrSortedColIndA,
+                                   int                      rowBlockDimA,
+                                   int                      colBlockDimA,
+                                   int                      rowBlockDimC,
+                                   int                      colBlockDimC,
+                                   size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                   cusparseDirection_t      dirA,
+                                   int                      mb,
+                                   int                      nb,
+                                   int                      nnzb,
+                                   const cusparseMatDescr_t descrA,
+                                   const cuDoubleComplex*   bsrSortedValA,
+                                   const int*               bsrSortedRowPtrA,
+                                   const int*               bsrSortedColIndA,
+                                   int                      rowBlockDimA,
+                                   int                      colBlockDimA,
+                                   int                      rowBlockDimC,
+                                   int                      colBlockDimC,
+                                   size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXgebsr2gebsrNnz(cusparseHandle_t         handle,
+                        cusparseDirection_t      dirA,
+                        int                      mb,
+                        int                      nb,
+                        int                      nnzb,
+                        const cusparseMatDescr_t descrA,
+                        const int*               bsrSortedRowPtrA,
+                        const int*               bsrSortedColIndA,
+                        int                      rowBlockDimA,
+                        int                      colBlockDimA,
+                        const cusparseMatDescr_t descrC,
+                        int*                     bsrSortedRowPtrC,
+                        int                      rowBlockDimC,
+                        int                      colBlockDimC,
+                        int*                     nnzTotalDevHostPtr,
+                        void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsr(cusparseHandle_t         handle,
+                     cusparseDirection_t      dirA,
+                     int                      mb,
+                     int                      nb,
+                     int                      nnzb,
+                     const cusparseMatDescr_t descrA,
+                     const float*             bsrSortedValA,
+                     const int*               bsrSortedRowPtrA,
+                     const int*               bsrSortedColIndA,
+                     int                      rowBlockDimA,
+                     int                      colBlockDimA,
+                     const cusparseMatDescr_t descrC,
+                     float*                   bsrSortedValC,
+                     int*                     bsrSortedRowPtrC,
+                     int*                     bsrSortedColIndC,
+                     int                      rowBlockDimC,
+                     int                      colBlockDimC,
+                     void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsr(cusparseHandle_t         handle,
+                     cusparseDirection_t      dirA,
+                     int                      mb,
+                     int                      nb,
+                     int                      nnzb,
+                     const cusparseMatDescr_t descrA,
+                     const double*            bsrSortedValA,
+                     const int*               bsrSortedRowPtrA,
+                     const int*               bsrSortedColIndA,
+                     int                      rowBlockDimA,
+                     int                      colBlockDimA,
+                     const cusparseMatDescr_t descrC,
+                     double*                  bsrSortedValC,
+                     int*                     bsrSortedRowPtrC,
+                     int*                     bsrSortedColIndC,
+                     int                      rowBlockDimC,
+                     int                      colBlockDimC,
+                     void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsr(cusparseHandle_t         handle,
+                     cusparseDirection_t      dirA,
+                     int                      mb,
+                     int                      nb,
+                     int                      nnzb,
+                     const cusparseMatDescr_t descrA,
+                     const cuComplex*         bsrSortedValA,
+                     const int*               bsrSortedRowPtrA,
+                     const int*               bsrSortedColIndA,
+                     int                      rowBlockDimA,
+                     int                      colBlockDimA,
+                     const cusparseMatDescr_t descrC,
+                     cuComplex*               bsrSortedValC,
+                     int*                     bsrSortedRowPtrC,
+                     int*                     bsrSortedColIndC,
+                     int                      rowBlockDimC,
+                     int                      colBlockDimC,
+                     void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsr(cusparseHandle_t         handle,
+                     cusparseDirection_t      dirA,
+                     int                      mb,
+                     int                      nb,
+                     int                      nnzb,
+                     const cusparseMatDescr_t descrA,
+                     const cuDoubleComplex*   bsrSortedValA,
+                     const int*               bsrSortedRowPtrA,
+                     const int*               bsrSortedColIndA,
+                     int                      rowBlockDimA,
+                     int                      colBlockDimA,
+                     const cusparseMatDescr_t descrC,
+                     cuDoubleComplex*         bsrSortedValC,
+                     int*                     bsrSortedRowPtrC,
+                     int*                     bsrSortedColIndC,
+                     int                      rowBlockDimC,
+                     int                      colBlockDimC,
+                     void*                    pBuffer);
+
+//##############################################################################
+//# SPARSE MATRIX SORTING
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateIdentityPermutation(cusparseHandle_t handle,
+                                  int              n,
+                                  int*             p);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcoosort_bufferSizeExt(cusparseHandle_t handle,
+                               int              m,
+                               int              n,
+                               int              nnz,
+                               const int*       cooRowsA,
+                               const int*       cooColsA,
+                               size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcoosortByRow(cusparseHandle_t handle,
+                      int              m,
+                      int              n,
+                      int              nnz,
+                      int*             cooRowsA,
+                      int*             cooColsA,
+                      int*             P,
+                      void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcoosortByColumn(cusparseHandle_t handle,
+                         int              m,
+                         int              n,
+                         int              nnz,
+                         int*             cooRowsA,
+                         int*             cooColsA,
+                         int*             P,
+                         void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsrsort_bufferSizeExt(cusparseHandle_t handle,
+                               int              m,
+                               int              n,
+                               int              nnz,
+                               const int*       csrRowPtrA,
+                               const int*       csrColIndA,
+                               size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsrsort(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      n,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 const int*               csrRowPtrA,
+                 int*                     csrColIndA,
+                 int*                     P,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcscsort_bufferSizeExt(cusparseHandle_t handle,
+                               int              m,
+                               int              n,
+                               int              nnz,
+                               const int*       cscColPtrA,
+                               const int*       cscRowIndA,
+                               size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcscsort(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      n,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 const int*               cscColPtrA,
+                 int*                     cscRowIndA,
+                 int*                     P,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsru2csr_bufferSizeExt(cusparseHandle_t handle,
+                                int              m,
+                                int              n,
+                                int              nnz,
+                                float*           csrVal,
+                                const int*       csrRowPtr,
+                                int*             csrColInd,
+                                csru2csrInfo_t   info,
+                                size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsru2csr_bufferSizeExt(cusparseHandle_t handle,
+                                int              m,
+                                int              n,
+                                int              nnz,
+                                double*          csrVal,
+                                const int*       csrRowPtr,
+                                int*             csrColInd,
+                                csru2csrInfo_t   info,
+                                size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsru2csr_bufferSizeExt(cusparseHandle_t handle,
+                                int              m,
+                                int              n,
+                                int              nnz,
+                                cuComplex*       csrVal,
+                                const int*       csrRowPtr,
+                                int*             csrColInd,
+                                csru2csrInfo_t   info,
+                                size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsru2csr_bufferSizeExt(cusparseHandle_t handle,
+                                int              m,
+                                int              n,
+                                int              nnz,
+                                cuDoubleComplex* csrVal,
+                                const int*       csrRowPtr,
+                                int*             csrColInd,
+                                csru2csrInfo_t   info,
+                                size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsru2csr(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  float*                   csrVal,
+                  const int*               csrRowPtr,
+                  int*                     csrColInd,
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsru2csr(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  double*                  csrVal,
+                  const int*               csrRowPtr,
+                  int*                     csrColInd,
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsru2csr(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  cuComplex*               csrVal,
+                  const int*               csrRowPtr,
+                  int*                     csrColInd,
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsru2csr(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  cuDoubleComplex*         csrVal,
+                  const int*               csrRowPtr,
+                  int*                     csrColInd,
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2csru(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  float*                   csrVal,
+                  const int*               csrRowPtr,
+                  int*                     csrColInd,
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2csru(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  double*                  csrVal,
+                  const int*               csrRowPtr,
+                  int*                     csrColInd,
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2csru(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  cuComplex*               csrVal,
+                  const int*               csrRowPtr,
+                  int*                     csrColInd,
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2csru(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  cuDoubleComplex*         csrVal,
+                  const int*               csrRowPtr,
+                  int*                     csrColInd,
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer);
+
+#if defined(__cplusplus)
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                      int                      m,
+                                      int                      n,
+                                      const __half*            A,
+                                      int                      lda,
+                                      const __half*            threshold,
+                                      const cusparseMatDescr_t descrC,
+                                      const __half*            csrSortedValC,
+                                      const int*               csrSortedRowPtrC,
+                                      const int*               csrSortedColIndC,
+                                      size_t* pBufferSizeInBytes);
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                      int                      m,
+                                      int                      n,
+                                      const float*             A,
+                                      int                      lda,
+                                      const float*             threshold,
+                                      const cusparseMatDescr_t descrC,
+                                      const float*             csrSortedValC,
+                                      const int*               csrSortedRowPtrC,
+                                      const int*               csrSortedColIndC,
+                                      size_t* pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                      int                      m,
+                                      int                      n,
+                                      const double*            A,
+                                      int                      lda,
+                                      const double*            threshold,
+                                      const cusparseMatDescr_t descrC,
+                                      const double*            csrSortedValC,
+                                      const int*               csrSortedRowPtrC,
+                                      const int*               csrSortedColIndC,
+                                      size_t*               pBufferSizeInBytes);
+
+#if defined(__cplusplus)
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csrNnz(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      n,
+                           const __half*            A,
+                           int                      lda,
+                           const __half*            threshold,
+                           const cusparseMatDescr_t descrC,
+                           int*                     csrRowPtrC,
+                           int*                     nnzTotalDevHostPtr,
+                           void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csrNnz(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      n,
+                           const float*             A,
+                           int                      lda,
+                           const float*             threshold,
+                           const cusparseMatDescr_t descrC,
+                           int*                     csrRowPtrC,
+                           int*                     nnzTotalDevHostPtr,
+                           void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csrNnz(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      n,
+                           const double*            A,
+                           int                      lda,
+                           const double*            threshold,
+                           const cusparseMatDescr_t descrC,
+                           int*                     csrSortedRowPtrC,
+                           int*                     nnzTotalDevHostPtr,
+                           void*                    pBuffer);
+
+#if defined(__cplusplus)
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csr(cusparseHandle_t         handle,
+                        int                      m,
+                        int                      n,
+                        const __half*            A,
+                        int                      lda,
+                        const __half*            threshold,
+                        const cusparseMatDescr_t descrC,
+                        __half*                  csrSortedValC,
+                        const int*               csrSortedRowPtrC,
+                        int*                     csrSortedColIndC,
+                        void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csr(cusparseHandle_t         handle,
+                        int                      m,
+                        int                      n,
+                        const float*             A,
+                        int                      lda,
+                        const float*             threshold,
+                        const cusparseMatDescr_t descrC,
+                        float*                   csrSortedValC,
+                        const int*               csrSortedRowPtrC,
+                        int*                     csrSortedColIndC,
+                        void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csr(cusparseHandle_t         handle,
+                        int                      m,
+                        int                      n,
+                        const double*            A,
+                        int                      lda,
+                        const double*            threshold,
+                        const cusparseMatDescr_t descrC,
+                        double*                  csrSortedValC,
+                        const int*               csrSortedRowPtrC,
+                        int*                     csrSortedColIndC,
+                        void*                    pBuffer);
+
+#if defined(__cplusplus)
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneCsr2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const __half*            csrSortedValA,
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    const __half*            threshold,
+                                    const cusparseMatDescr_t descrC,
+                                    const __half*            csrSortedValC,
+                                    const int*               csrSortedRowPtrC,
+                                    const int*               csrSortedColIndC,
+                                    size_t* pBufferSizeInBytes);
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneCsr2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const float*             csrSortedValA,
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    const float*             threshold,
+                                    const cusparseMatDescr_t descrC,
+                                    const float*             csrSortedValC,
+                                    const int*               csrSortedRowPtrC,
+                                    const int*               csrSortedColIndC,
+                                    size_t*                 pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneCsr2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const double*            csrSortedValA,
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    const double*            threshold,
+                                    const cusparseMatDescr_t descrC,
+                                    const double*            csrSortedValC,
+                                    const int*               csrSortedRowPtrC,
+                                    const int*               csrSortedColIndC,
+                                    size_t*                 pBufferSizeInBytes);
+
+#if defined(__cplusplus)
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneCsr2csrNnz(cusparseHandle_t         handle,
+                         int                      m,
+                         int                      n,
+                         int                      nnzA,
+                         const cusparseMatDescr_t descrA,
+                         const __half*            csrSortedValA,
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         const __half*            threshold,
+                         const cusparseMatDescr_t descrC,
+                         int*                     csrSortedRowPtrC,
+                         int*                     nnzTotalDevHostPtr,
+                         void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneCsr2csrNnz(cusparseHandle_t         handle,
+                         int                      m,
+                         int                      n,
+                         int                      nnzA,
+                         const cusparseMatDescr_t descrA,
+                         const float*             csrSortedValA,
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         const float*             threshold,
+                         const cusparseMatDescr_t descrC,
+                         int*                     csrSortedRowPtrC,
+                         int*                     nnzTotalDevHostPtr,
+                         void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+ cusparseDpruneCsr2csrNnz(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      n,
+                          int                      nnzA,
+                          const cusparseMatDescr_t descrA,
+                          const double*            csrSortedValA,
+                          const int*               csrSortedRowPtrA,
+                          const int*               csrSortedColIndA,
+                          const double*            threshold,
+                          const cusparseMatDescr_t descrC,
+                          int*                     csrSortedRowPtrC,
+                          int*                     nnzTotalDevHostPtr,
+                          void*                    pBuffer);
+
+#if defined(__cplusplus)
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneCsr2csr(cusparseHandle_t         handle,
+                      int                      m,
+                      int                      n,
+                      int                      nnzA,
+                      const cusparseMatDescr_t descrA,
+                      const __half*            csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      const __half*            threshold,
+                      const cusparseMatDescr_t descrC,
+                      __half*                  csrSortedValC,
+                      const int*               csrSortedRowPtrC,
+                      int*                     csrSortedColIndC,
+                      void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneCsr2csr(cusparseHandle_t         handle,
+                      int                      m,
+                      int                      n,
+                      int                      nnzA,
+                      const cusparseMatDescr_t descrA,
+                      const float*             csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      const float*             threshold,
+                      const cusparseMatDescr_t descrC,
+                      float*                   csrSortedValC,
+                      const int*               csrSortedRowPtrC,
+                      int*                     csrSortedColIndC,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneCsr2csr(cusparseHandle_t         handle,
+                      int                      m,
+                      int                      n,
+                      int                      nnzA,
+                      const cusparseMatDescr_t descrA,
+                      const double*            csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      const double*            threshold,
+                      const cusparseMatDescr_t descrC,
+                      double*                  csrSortedValC,
+                      const int*               csrSortedRowPtrC,
+                      int*                     csrSortedColIndC,
+                      void*                    pBuffer);
+
+#if defined(__cplusplus)
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   const __half*            A,
+                                   int                      lda,
+                                   float                    percentage,
+                                   const cusparseMatDescr_t descrC,
+                                   const __half*            csrSortedValC,
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info,
+                                   size_t*                  pBufferSizeInBytes);
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   const float*             A,
+                                   int                      lda,
+                                   float                    percentage,
+                                   const cusparseMatDescr_t descrC,
+                                   const float*             csrSortedValC,
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info,
+                                   size_t*                  pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   const double*            A,
+                                   int                      lda,
+                                   float                    percentage,
+                                   const cusparseMatDescr_t descrC,
+                                   const double*            csrSortedValC,
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info,
+                                   size_t*                  pBufferSizeInBytes);
+
+#if defined(__cplusplus)
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const __half*            A,
+                                    int                      lda,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrRowPtrC,
+                                    int*                     nnzTotalDevHostPtr,
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const float*             A,
+                                    int                      lda,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrRowPtrC,
+                                    int*                     nnzTotalDevHostPtr,
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const double*            A,
+                                    int                      lda,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrRowPtrC,
+                                    int*                     nnzTotalDevHostPtr,
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+
+#if defined(__cplusplus)
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csrByPercentage(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const __half*            A,
+                                    int                      lda,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    __half*                  csrSortedValC,
+                                    const int*               csrSortedRowPtrC,
+                                    int*                     csrSortedColIndC,
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csrByPercentage(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const float*             A,
+                                    int                      lda,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    float*                   csrSortedValC,
+                                    const int*               csrSortedRowPtrC,
+                                    int*                     csrSortedColIndC,
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csrByPercentage(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const double*            A,
+                                    int                      lda,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    double*                  csrSortedValC,
+                                    const int*               csrSortedRowPtrC,
+                                    int*                     csrSortedColIndC,
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+
+#if defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneCsr2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   int                      nnzA,
+                                   const cusparseMatDescr_t descrA,
+                                   const __half*            csrSortedValA,
+                                   const int*               csrSortedRowPtrA,
+                                   const int*               csrSortedColIndA,
+                                   float                    percentage,
+                                   const cusparseMatDescr_t descrC,
+                                   const __half*            csrSortedValC,
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info,
+                                   size_t*                  pBufferSizeInBytes);
+
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneCsr2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   int                      nnzA,
+                                   const cusparseMatDescr_t descrA,
+                                   const float*             csrSortedValA,
+                                   const int*               csrSortedRowPtrA,
+                                   const int*               csrSortedColIndA,
+                                   float                    percentage,
+                                   const cusparseMatDescr_t descrC,
+                                   const float*             csrSortedValC,
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info,
+                                   size_t*                  pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneCsr2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   int                      nnzA,
+                                   const cusparseMatDescr_t descrA,
+                                   const double*            csrSortedValA,
+                                   const int*               csrSortedRowPtrA,
+                                   const int*               csrSortedColIndA,
+                                   float                    percentage,
+                                   const cusparseMatDescr_t descrC,
+                                   const double*            csrSortedValC,
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info,
+                                   size_t*                  pBufferSizeInBytes);
+
+#if defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneCsr2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const __half*            csrSortedValA,
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrSortedRowPtrC,
+                                    int*                     nnzTotalDevHostPtr,
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneCsr2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const float*             csrSortedValA,
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrSortedRowPtrC,
+                                    int*                     nnzTotalDevHostPtr,
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneCsr2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const double*            csrSortedValA,
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrSortedRowPtrC,
+                                    int*                     nnzTotalDevHostPtr,
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+
+#if defined(__cplusplus)
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneCsr2csrByPercentage(cusparseHandle_t         handle,
+                                  int                      m,
+                                  int                      n,
+                                  int                      nnzA,
+                                  const cusparseMatDescr_t descrA,
+                                  const __half*            csrSortedValA,
+                                  const int*               csrSortedRowPtrA,
+                                  const int*               csrSortedColIndA,
+                                  float percentage, /* between 0 to 100 */
+                                  const cusparseMatDescr_t descrC,
+                                  __half*                  csrSortedValC,
+                                  const int*               csrSortedRowPtrC,
+                                  int*                     csrSortedColIndC,
+                                  pruneInfo_t              info,
+                                  void*                    pBuffer);
+
+#endif // defined(__cplusplus)
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneCsr2csrByPercentage(cusparseHandle_t         handle,
+                                  int                      m,
+                                  int                      n,
+                                  int                      nnzA,
+                                  const cusparseMatDescr_t descrA,
+                                  const float*             csrSortedValA,
+                                  const int*               csrSortedRowPtrA,
+                                  const int*               csrSortedColIndA,
+                                  float                    percentage,
+                                  const cusparseMatDescr_t descrC,
+                                  float*                   csrSortedValC,
+                                  const int*               csrSortedRowPtrC,
+                                  int*                     csrSortedColIndC,
+                                  pruneInfo_t              info,
+                                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneCsr2csrByPercentage(cusparseHandle_t         handle,
+                                  int                      m,
+                                  int                      n,
+                                  int                      nnzA,
+                                  const cusparseMatDescr_t descrA,
+                                  const double*            csrSortedValA,
+                                  const int*               csrSortedRowPtrA,
+                                  const int*               csrSortedColIndA,
+                                  float                    percentage,
+                                  const cusparseMatDescr_t descrC,
+                                  double*                  csrSortedValC,
+                                  const int*               csrSortedRowPtrC,
+                                  int*                     csrSortedColIndC,
+                                  pruneInfo_t              info,
+                                  void*                    pBuffer);
+
+//##############################################################################
+//# CSR2CSC
+//##############################################################################
+
+typedef enum {
+    CUSPARSE_CSR2CSC_ALG1 = 1, // faster than V2 (in general), deterministc
+    CUSPARSE_CSR2CSC_ALG2 = 2  // low memory requirement, non-deterministc
+} cusparseCsr2CscAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCsr2cscEx2(cusparseHandle_t     handle,
+                   int                  m,
+                   int                  n,
+                   int                  nnz,
+                   const void*          csrVal,
+                   const int*           csrRowPtr,
+                   const int*           csrColInd,
+                   void*                cscVal,
+                   int*                 cscColPtr,
+                   int*                 cscRowInd,
+                   cudaDataType         valType,
+                   cusparseAction_t     copyValues,
+                   cusparseIndexBase_t  idxBase,
+                   cusparseCsr2CscAlg_t alg,
+                   void*                buffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCsr2cscEx2_bufferSize(cusparseHandle_t     handle,
+                              int                  m,
+                              int                  n,
+                              int                  nnz,
+                              const void*          csrVal,
+                              const int*           csrRowPtr,
+                              const int*           csrColInd,
+                              void*                cscVal,
+                              int*                 cscColPtr,
+                              int*                 cscRowInd,
+                              cudaDataType         valType,
+                              cusparseAction_t     copyValues,
+                              cusparseIndexBase_t  idxBase,
+                              cusparseCsr2CscAlg_t alg,
+                              size_t*              bufferSize);
+
+// #############################################################################
+// # GENERIC APIs - Enumerators and Opaque Data Structures
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_FORMAT_CSR         = 1, ///< Compressed Sparse Row (CSR)
+    CUSPARSE_FORMAT_CSC         = 2, ///< Compressed Sparse Column (CSC)
+    CUSPARSE_FORMAT_COO         = 3, ///< Coordinate (COO) - Structure of Arrays
+    CUSPARSE_FORMAT_COO_AOS     = 4, ///< Coordinate (COO) - Array of Structures
+    CUSPARSE_FORMAT_BLOCKED_ELL = 5, ///< Blocked ELL
+} cusparseFormat_t;
+
+typedef enum {
+    CUSPARSE_ORDER_COL = 1, ///< Column-Major Order - Matrix memory layout
+    CUSPARSE_ORDER_ROW = 2  ///< Row-Major Order - Matrix memory layout
+} cusparseOrder_t;
+
+typedef enum {
+    CUSPARSE_INDEX_16U = 1, ///< 16-bit unsigned integer for matrix/vector
+                            ///< indices
+    CUSPARSE_INDEX_32I = 2, ///< 32-bit signed integer for matrix/vector indices
+    CUSPARSE_INDEX_64I = 3  ///< 64-bit signed integer for matrix/vector indices
+} cusparseIndexType_t;
+
+//------------------------------------------------------------------------------
+
+struct cusparseSpVecDescr;
+struct cusparseDnVecDescr;
+struct cusparseSpMatDescr;
+struct cusparseDnMatDescr;
+typedef struct cusparseSpVecDescr* cusparseSpVecDescr_t;
+typedef struct cusparseDnVecDescr* cusparseDnVecDescr_t;
+typedef struct cusparseSpMatDescr* cusparseSpMatDescr_t;
+typedef struct cusparseDnMatDescr* cusparseDnMatDescr_t;
+
+// #############################################################################
+// # SPARSE VECTOR DESCRIPTOR
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateSpVec(cusparseSpVecDescr_t* spVecDescr,
+                    int64_t               size,
+                    int64_t               nnz,
+                    void*                 indices,
+                    void*                 values,
+                    cusparseIndexType_t   idxType,
+                    cusparseIndexBase_t   idxBase,
+                    cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroySpVec(cusparseSpVecDescr_t spVecDescr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpVecGet(cusparseSpVecDescr_t spVecDescr,
+                 int64_t*             size,
+                 int64_t*             nnz,
+                 void**               indices,
+                 void**               values,
+                 cusparseIndexType_t* idxType,
+                 cusparseIndexBase_t* idxBase,
+                 cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpVecGetIndexBase(cusparseSpVecDescr_t spVecDescr,
+                          cusparseIndexBase_t* idxBase);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpVecGetValues(cusparseSpVecDescr_t spVecDescr,
+                       void**               values);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpVecSetValues(cusparseSpVecDescr_t spVecDescr,
+                       void*                values);
+
+// #############################################################################
+// # DENSE VECTOR DESCRIPTOR
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateDnVec(cusparseDnVecDescr_t* dnVecDescr,
+                    int64_t               size,
+                    void*                 values,
+                    cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyDnVec(cusparseDnVecDescr_t dnVecDescr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnVecGet(cusparseDnVecDescr_t dnVecDescr,
+                 int64_t*             size,
+                 void**               values,
+                 cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnVecGetValues(cusparseDnVecDescr_t dnVecDescr,
+                       void**               values);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnVecSetValues(cusparseDnVecDescr_t dnVecDescr,
+                       void*                values);
+
+// #############################################################################
+// # SPARSE MATRIX DESCRIPTOR
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroySpMat(cusparseSpMatDescr_t spMatDescr);
+
+ cusparseStatus_t CUSPARSEAPI
+cusparseSpMatGetFormat(cusparseSpMatDescr_t spMatDescr,
+                       cusparseFormat_t*    format);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMatGetIndexBase(cusparseSpMatDescr_t spMatDescr,
+                          cusparseIndexBase_t* idxBase);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMatGetValues(cusparseSpMatDescr_t spMatDescr,
+                       void**               values);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMatSetValues(cusparseSpMatDescr_t spMatDescr,
+                       void*                values);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMatGetSize(cusparseSpMatDescr_t spMatDescr,
+                     int64_t*             rows,
+                     int64_t*             cols,
+                     int64_t*             nnz);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMatSetStridedBatch(cusparseSpMatDescr_t spMatDescr,
+                             int                  batchCount);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMatGetStridedBatch(cusparseSpMatDescr_t spMatDescr,
+                             int*                 batchCount);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCooSetStridedBatch(cusparseSpMatDescr_t spMatDescr,
+                            int                 batchCount,
+                            int64_t             batchStride);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCsrSetStridedBatch(cusparseSpMatDescr_t spMatDescr,
+                            int                 batchCount,
+                            int64_t             offsetsBatchStride,
+                            int64_t             columnsValuesBatchStride);
+
+typedef enum {
+    CUSPARSE_SPMAT_FILL_MODE,
+    CUSPARSE_SPMAT_DIAG_TYPE
+} cusparseSpMatAttribute_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMatGetAttribute(cusparseSpMatDescr_t     spMatDescr,
+                          cusparseSpMatAttribute_t attribute,
+                          void*                    data,
+                          size_t                   dataSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMatSetAttribute(cusparseSpMatDescr_t     spMatDescr,
+                          cusparseSpMatAttribute_t attribute,
+                          void*                    data,
+                          size_t                   dataSize);
+
+//------------------------------------------------------------------------------
+// ### CSR ###
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsr(cusparseSpMatDescr_t* spMatDescr,
+                  int64_t               rows,
+                  int64_t               cols,
+                  int64_t               nnz,
+                  void*                 csrRowOffsets,
+                  void*                 csrColInd,
+                  void*                 csrValues,
+                  cusparseIndexType_t   csrRowOffsetsType,
+                  cusparseIndexType_t   csrColIndType,
+                  cusparseIndexBase_t   idxBase,
+                  cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsc(cusparseSpMatDescr_t* spMatDescr,
+                  int64_t               rows,
+                  int64_t               cols,
+                  int64_t               nnz,
+                  void*                 cscColOffsets,
+                  void*                 cscRowInd,
+                  void*                 cscValues,
+                  cusparseIndexType_t   cscColOffsetsType,
+                  cusparseIndexType_t   cscRowIndType,
+                  cusparseIndexBase_t   idxBase,
+                  cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCsrGet(cusparseSpMatDescr_t spMatDescr,
+               int64_t*             rows,
+               int64_t*             cols,
+               int64_t*             nnz,
+               void**               csrRowOffsets,
+               void**               csrColInd,
+               void**               csrValues,
+               cusparseIndexType_t* csrRowOffsetsType,
+               cusparseIndexType_t* csrColIndType,
+               cusparseIndexBase_t* idxBase,
+               cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCscGet(cusparseSpMatDescr_t spMatDescr,
+               int64_t*             rows,
+               int64_t*             cols,
+               int64_t*             nnz,
+               void**               cscColOffsets,
+               void**               cscRowInd,
+               void**               cscValues,
+               cusparseIndexType_t* cscColOffsetsType,
+               cusparseIndexType_t* cscRowIndType,
+               cusparseIndexBase_t* idxBase,
+               cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCsrSetPointers(cusparseSpMatDescr_t spMatDescr,
+                       void*                csrRowOffsets,
+                       void*                csrColInd,
+                       void*                csrValues);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCscSetPointers(cusparseSpMatDescr_t spMatDescr,
+                       void*                cscColOffsets,
+                       void*                cscRowInd,
+                       void*                cscValues);
+
+//------------------------------------------------------------------------------
+// ### COO ###
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCoo(cusparseSpMatDescr_t* spMatDescr,
+                  int64_t               rows,
+                  int64_t               cols,
+                  int64_t               nnz,
+                  void*                 cooRowInd,
+                  void*                 cooColInd,
+                  void*                 cooValues,
+                  cusparseIndexType_t   cooIdxType,
+                  cusparseIndexBase_t   idxBase,
+                  cudaDataType          valueType);
+
+CUSPARSE_DEPRECATED(cusparseCreateCoo)
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCooAoS(cusparseSpMatDescr_t* spMatDescr,
+                     int64_t               rows,
+                     int64_t               cols,
+                     int64_t               nnz,
+                     void*                 cooInd,
+                     void*                 cooValues,
+                     cusparseIndexType_t   cooIdxType,
+                     cusparseIndexBase_t   idxBase,
+                     cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCooGet(cusparseSpMatDescr_t spMatDescr,
+               int64_t*             rows,
+               int64_t*             cols,
+               int64_t*             nnz,
+               void**               cooRowInd,  // COO row indices
+               void**               cooColInd,  // COO column indices
+               void**               cooValues,  // COO values
+               cusparseIndexType_t* idxType,
+               cusparseIndexBase_t* idxBase,
+               cudaDataType*        valueType);
+
+CUSPARSE_DEPRECATED(cusparseCooGet)
+cusparseStatus_t CUSPARSEAPI
+cusparseCooAoSGet(cusparseSpMatDescr_t spMatDescr,
+                  int64_t*             rows,
+                  int64_t*             cols,
+                  int64_t*             nnz,
+                  void**               cooInd,     // COO indices
+                  void**               cooValues,  // COO values
+                  cusparseIndexType_t* idxType,
+                  cusparseIndexBase_t* idxBase,
+                  cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCooSetPointers(cusparseSpMatDescr_t spMatDescr,
+                       void*                cooRows,
+                       void*                cooColumns,
+                       void*                cooValues);
+
+//------------------------------------------------------------------------------
+// ### BLOCKED ELL ###
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateBlockedEll(cusparseSpMatDescr_t* spMatDescr,
+                         int64_t               rows,
+                         int64_t               cols,
+                         int64_t               ellBlockSize,
+                         int64_t               ellCols,
+                         void*                 ellColInd,
+                         void*                 ellValue,
+                         cusparseIndexType_t   ellIdxType,
+                         cusparseIndexBase_t   idxBase,
+                         cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseBlockedEllGet(cusparseSpMatDescr_t spMatDescr,
+                      int64_t*             rows,
+                      int64_t*             cols,
+                      int64_t*             ellBlockSize,
+                      int64_t*             ellCols,
+                      void**               ellColInd,
+                      void**               ellValue,
+                      cusparseIndexType_t* ellIdxType,
+                      cusparseIndexBase_t* idxBase,
+                      cudaDataType*        valueType);
+
+// #############################################################################
+// # DENSE MATRIX DESCRIPTOR
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateDnMat(cusparseDnMatDescr_t* dnMatDescr,
+                    int64_t               rows,
+                    int64_t               cols,
+                    int64_t               ld,
+                    void*                 values,
+                    cudaDataType          valueType,
+                    cusparseOrder_t       order);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyDnMat(cusparseDnMatDescr_t dnMatDescr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnMatGet(cusparseDnMatDescr_t dnMatDescr,
+                 int64_t*             rows,
+                 int64_t*             cols,
+                 int64_t*             ld,
+                 void**               values,
+                 cudaDataType*        type,
+                 cusparseOrder_t*     order);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnMatGetValues(cusparseDnMatDescr_t dnMatDescr,
+                       void**               values);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnMatSetValues(cusparseDnMatDescr_t dnMatDescr,
+                       void*                values);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnMatSetStridedBatch(cusparseDnMatDescr_t dnMatDescr,
+                             int                  batchCount,
+                             int64_t              batchStride);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnMatGetStridedBatch(cusparseDnMatDescr_t dnMatDescr,
+                             int*                 batchCount,
+                             int64_t*             batchStride);
+
+// #############################################################################
+// # VECTOR-VECTOR OPERATIONS
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseAxpby(cusparseHandle_t     handle,
+              const void*          alpha,
+              cusparseSpVecDescr_t vecX,
+              const void*          beta,
+              cusparseDnVecDescr_t vecY);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGather(cusparseHandle_t     handle,
+               cusparseDnVecDescr_t vecY,
+               cusparseSpVecDescr_t vecX);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScatter(cusparseHandle_t     handle,
+                cusparseSpVecDescr_t vecX,
+                cusparseDnVecDescr_t vecY);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseRot(cusparseHandle_t     handle,
+            const void*          c_coeff,
+            const void*          s_coeff,
+            cusparseSpVecDescr_t vecX,
+            cusparseDnVecDescr_t vecY);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpVV_bufferSize(cusparseHandle_t     handle,
+                        cusparseOperation_t  opX,
+                        cusparseSpVecDescr_t vecX,
+                        cusparseDnVecDescr_t vecY,
+                        const void*          result,
+                        cudaDataType         computeType,
+                        size_t*              bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpVV(cusparseHandle_t     handle,
+             cusparseOperation_t  opX,
+             cusparseSpVecDescr_t vecX,
+             cusparseDnVecDescr_t vecY,
+             void*                result,
+             cudaDataType         computeType,
+             void*                externalBuffer);
+
+// #############################################################################
+// # SPARSE TO DENSE
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_SPARSETODENSE_ALG_DEFAULT = 0
+} cusparseSparseToDenseAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSparseToDense_bufferSize(cusparseHandle_t           handle,
+                                 cusparseSpMatDescr_t       matA,
+                                 cusparseDnMatDescr_t       matB,
+                                 cusparseSparseToDenseAlg_t alg,
+                                 size_t*                    bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSparseToDense(cusparseHandle_t           handle,
+                      cusparseSpMatDescr_t       matA,
+                      cusparseDnMatDescr_t       matB,
+                      cusparseSparseToDenseAlg_t alg,
+                      void*                      externalBuffer);
+
+
+// #############################################################################
+// # DENSE TO SPARSE
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_DENSETOSPARSE_ALG_DEFAULT = 0
+} cusparseDenseToSparseAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDenseToSparse_bufferSize(cusparseHandle_t           handle,
+                                 cusparseDnMatDescr_t       matA,
+                                 cusparseSpMatDescr_t       matB,
+                                 cusparseDenseToSparseAlg_t alg,
+                                 size_t*                    bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDenseToSparse_analysis(cusparseHandle_t           handle,
+                               cusparseDnMatDescr_t       matA,
+                               cusparseSpMatDescr_t       matB,
+                               cusparseDenseToSparseAlg_t alg,
+                               void*                      externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDenseToSparse_convert(cusparseHandle_t           handle,
+                              cusparseDnMatDescr_t       matA,
+                              cusparseSpMatDescr_t       matB,
+                              cusparseDenseToSparseAlg_t alg,
+                              void*                      externalBuffer);
+
+// #############################################################################
+// # SPARSE MATRIX-VECTOR MULTIPLICATION
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_MV_ALG_DEFAULT
+                        /*CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_ALG_DEFAULT)*/ = 0,
+    CUSPARSE_COOMV_ALG  CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_COO_ALG1)    = 1,
+    CUSPARSE_CSRMV_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_CSR_ALG1)    = 2,
+    CUSPARSE_CSRMV_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMV_CSR_ALG2)    = 3,
+    CUSPARSE_SPMV_ALG_DEFAULT = 0,
+    CUSPARSE_SPMV_CSR_ALG1    = 2,
+    CUSPARSE_SPMV_CSR_ALG2    = 3,
+    CUSPARSE_SPMV_COO_ALG1    = 1,
+    CUSPARSE_SPMV_COO_ALG2    = 4
+} cusparseSpMVAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMV(cusparseHandle_t     handle,
+             cusparseOperation_t  opA,
+             const void*          alpha,
+             cusparseSpMatDescr_t matA,
+             cusparseDnVecDescr_t vecX,
+             const void*          beta,
+             cusparseDnVecDescr_t vecY,
+             cudaDataType         computeType,
+             cusparseSpMVAlg_t    alg,
+             void*                externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMV_bufferSize(cusparseHandle_t    handle,
+                        cusparseOperation_t opA,
+                        const void*         alpha,
+                        cusparseSpMatDescr_t matA,
+                        cusparseDnVecDescr_t vecX,
+                        const void*          beta,
+                        cusparseDnVecDescr_t vecY,
+                        cudaDataType         computeType,
+                        cusparseSpMVAlg_t    alg,
+                        size_t*              bufferSize);
+
+// #############################################################################
+// # SPARSE TRIANGULAR VECTOR SOLVE
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_SPSV_ALG_DEFAULT = 0,
+} cusparseSpSVAlg_t;
+
+struct cusparseSpSVDescr;
+typedef struct cusparseSpSVDescr* cusparseSpSVDescr_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSV_createDescr(cusparseSpSVDescr_t* descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSV_destroyDescr(cusparseSpSVDescr_t descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSV_bufferSize(cusparseHandle_t     handle,
+                        cusparseOperation_t  opA,
+                        const void*          alpha,
+                        cusparseSpMatDescr_t matA,
+                        cusparseDnVecDescr_t vecX,
+                        cusparseDnVecDescr_t vecY,
+                        cudaDataType         computeType,
+                        cusparseSpSVAlg_t    alg,
+                        cusparseSpSVDescr_t  spsvDescr,
+                        size_t*              bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSV_analysis(cusparseHandle_t     handle,
+                      cusparseOperation_t  opA,
+                      const void*          alpha,
+                      cusparseSpMatDescr_t matA,
+                      cusparseDnVecDescr_t vecX,
+                      cusparseDnVecDescr_t vecY,
+                      cudaDataType         computeType,
+                      cusparseSpSVAlg_t    alg,
+                      cusparseSpSVDescr_t  spsvDescr,
+                      void*                externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSV_solve(cusparseHandle_t     handle,
+                   cusparseOperation_t  opA,
+                   const void*          alpha,
+                   cusparseSpMatDescr_t matA,
+                   cusparseDnVecDescr_t vecX,
+                   cusparseDnVecDescr_t vecY,
+                   cudaDataType         computeType,
+                   cusparseSpSVAlg_t    alg,
+                   cusparseSpSVDescr_t  spsvDescr);
+
+// #############################################################################
+// # SPARSE TRIANGULAR MATRIX SOLVE
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_SPSM_ALG_DEFAULT = 0,
+} cusparseSpSMAlg_t;
+
+struct cusparseSpSMDescr;
+typedef struct cusparseSpSMDescr* cusparseSpSMDescr_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSM_createDescr(cusparseSpSMDescr_t* descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSM_destroyDescr(cusparseSpSMDescr_t descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSM_bufferSize(cusparseHandle_t     handle,
+                        cusparseOperation_t  opA,
+                        cusparseOperation_t  opB,
+                        const void*          alpha,
+                        cusparseSpMatDescr_t matA,
+                        cusparseDnMatDescr_t matB,
+                        cusparseDnMatDescr_t matC,
+                        cudaDataType         computeType,
+                        cusparseSpSMAlg_t    alg,
+                        cusparseSpSMDescr_t  spsmDescr,
+                        size_t*              bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSM_analysis(cusparseHandle_t     handle,
+                        cusparseOperation_t  opA,
+                        cusparseOperation_t  opB,
+                        const void*          alpha,
+                        cusparseSpMatDescr_t matA,
+                        cusparseDnMatDescr_t matB,
+                        cusparseDnMatDescr_t matC,
+                        cudaDataType         computeType,
+                        cusparseSpSMAlg_t    alg,
+                        cusparseSpSMDescr_t  spsmDescr,
+                        void*                externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSM_solve(cusparseHandle_t     handle,
+                    cusparseOperation_t  opA,
+                    cusparseOperation_t  opB,
+                    const void*          alpha,
+                    cusparseSpMatDescr_t matA,
+                    cusparseDnMatDescr_t matB,
+                    cusparseDnMatDescr_t matC,
+                    cudaDataType         computeType,
+                    cusparseSpSMAlg_t    alg,
+                    cusparseSpSMDescr_t  spsmDescr);
+
+// #############################################################################
+// # SPARSE MATRIX-MATRIX MULTIPLICATION
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_MM_ALG_DEFAULT
+                        CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_ALG_DEFAULT) = 0,
+    CUSPARSE_COOMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG1) = 1,
+    CUSPARSE_COOMM_ALG2 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG2) = 2,
+    CUSPARSE_COOMM_ALG3 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_COO_ALG3) = 3,
+    CUSPARSE_CSRMM_ALG1 CUSPARSE_DEPRECATED_ENUM(CUSPARSE_SPMM_CSR_ALG1) = 4,
+    CUSPARSE_SPMM_ALG_DEFAULT      = 0,
+    CUSPARSE_SPMM_COO_ALG1         = 1,
+    CUSPARSE_SPMM_COO_ALG2         = 2,
+    CUSPARSE_SPMM_COO_ALG3         = 3,
+    CUSPARSE_SPMM_COO_ALG4         = 5,
+    CUSPARSE_SPMM_CSR_ALG1         = 4,
+    CUSPARSE_SPMM_CSR_ALG2         = 6,
+    CUSPARSE_SPMM_CSR_ALG3         = 12,
+    CUSPARSE_SPMM_BLOCKED_ELL_ALG1 = 13
+} cusparseSpMMAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMM_bufferSize(cusparseHandle_t     handle,
+                        cusparseOperation_t  opA,
+                        cusparseOperation_t  opB,
+                        const void*          alpha,
+                        cusparseSpMatDescr_t matA,
+                        cusparseDnMatDescr_t matB,
+                        const void*          beta,
+                        cusparseDnMatDescr_t matC,
+                        cudaDataType         computeType,
+                        cusparseSpMMAlg_t    alg,
+                        size_t*              bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMM_preprocess(cusparseHandle_t      handle,
+                        cusparseOperation_t   opA,
+                        cusparseOperation_t   opB,
+                        const void*           alpha,
+                        cusparseSpMatDescr_t  matA,
+                        cusparseDnMatDescr_t  matB,
+                        const void*           beta,
+                        cusparseDnMatDescr_t  matC,
+                        cudaDataType          computeType,
+                        cusparseSpMMAlg_t     alg,
+                        void*                 externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMM(cusparseHandle_t     handle,
+             cusparseOperation_t  opA,
+             cusparseOperation_t  opB,
+             const void*          alpha,
+             cusparseSpMatDescr_t matA,
+             cusparseDnMatDescr_t matB,
+             const void*          beta,
+             cusparseDnMatDescr_t matC,
+             cudaDataType         computeType,
+             cusparseSpMMAlg_t    alg,
+             void*                externalBuffer);
+
+// #############################################################################
+// # SPARSE MATRIX - SPARSE MATRIX MULTIPLICATION (SpGEMM)
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_SPGEMM_DEFAULT                 = 0,
+    CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC    = 1,
+    CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC = 2
+} cusparseSpGEMMAlg_t;
+
+struct cusparseSpGEMMDescr;
+typedef struct cusparseSpGEMMDescr* cusparseSpGEMMDescr_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_createDescr(cusparseSpGEMMDescr_t* descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_destroyDescr(cusparseSpGEMMDescr_t descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_workEstimation(cusparseHandle_t      handle,
+                              cusparseOperation_t   opA,
+                              cusparseOperation_t   opB,
+                              const void*           alpha,
+                              cusparseSpMatDescr_t  matA,
+                              cusparseSpMatDescr_t  matB,
+                              const void*           beta,
+                              cusparseSpMatDescr_t  matC,
+                              cudaDataType          computeType,
+                              cusparseSpGEMMAlg_t   alg,
+                              cusparseSpGEMMDescr_t spgemmDescr,
+                              size_t*               bufferSize1,
+                              void*                 externalBuffer1);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_compute(cusparseHandle_t      handle,
+                       cusparseOperation_t   opA,
+                       cusparseOperation_t   opB,
+                       const void*           alpha,
+                       cusparseSpMatDescr_t  matA,
+                       cusparseSpMatDescr_t  matB,
+                       const void*           beta,
+                       cusparseSpMatDescr_t  matC,
+                       cudaDataType          computeType,
+                       cusparseSpGEMMAlg_t   alg,
+                       cusparseSpGEMMDescr_t spgemmDescr,
+                       size_t*               bufferSize2,
+                       void*                 externalBuffer2);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_copy(cusparseHandle_t      handle,
+                    cusparseOperation_t   opA,
+                    cusparseOperation_t   opB,
+                    const void*           alpha,
+                    cusparseSpMatDescr_t  matA,
+                    cusparseSpMatDescr_t  matB,
+                    const void*           beta,
+                    cusparseSpMatDescr_t  matC,
+                    cudaDataType          computeType,
+                    cusparseSpGEMMAlg_t   alg,
+                    cusparseSpGEMMDescr_t spgemmDescr);
+
+// #############################################################################
+// # SPARSE MATRIX - SPARSE MATRIX MULTIPLICATION (SpGEMM) STRUCTURE REUSE
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMMreuse_workEstimation(cusparseHandle_t      handle,
+                                   cusparseOperation_t   opA,
+                                   cusparseOperation_t   opB,
+                                   cusparseSpMatDescr_t  matA,
+                                   cusparseSpMatDescr_t  matB,
+                                   cusparseSpMatDescr_t  matC,
+                                   cusparseSpGEMMAlg_t   alg,
+                                   cusparseSpGEMMDescr_t spgemmDescr,
+                                   size_t*               bufferSize1,
+                                   void*                 externalBuffer1);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMMreuse_nnz(cusparseHandle_t      handle,
+                        cusparseOperation_t   opA,
+                        cusparseOperation_t   opB,
+                        cusparseSpMatDescr_t  matA,
+                        cusparseSpMatDescr_t  matB,
+                        cusparseSpMatDescr_t  matC,
+                        cusparseSpGEMMAlg_t   alg,
+                        cusparseSpGEMMDescr_t spgemmDescr,
+                        size_t*               bufferSize2,
+                        void*                 externalBuffer2,
+                        size_t*               bufferSize3,
+                        void*                 externalBuffer3,
+                        size_t*               bufferSize4,
+                        void*                 externalBuffer4);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMMreuse_copy(cusparseHandle_t      handle,
+                         cusparseOperation_t   opA,
+                         cusparseOperation_t   opB,
+                         cusparseSpMatDescr_t  matA,
+                         cusparseSpMatDescr_t  matB,
+                         cusparseSpMatDescr_t  matC,
+                         cusparseSpGEMMAlg_t   alg,
+                         cusparseSpGEMMDescr_t spgemmDescr,
+                         size_t*               bufferSize5,
+                         void*                 externalBuffer5);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMMreuse_compute(cusparseHandle_t      handle,
+                            cusparseOperation_t   opA,
+                            cusparseOperation_t   opB,
+                            const void*           alpha,
+                            cusparseSpMatDescr_t  matA,
+                            cusparseSpMatDescr_t  matB,
+                            const void*           beta,
+                            cusparseSpMatDescr_t  matC,
+                            cudaDataType          computeType,
+                            cusparseSpGEMMAlg_t   alg,
+                            cusparseSpGEMMDescr_t spgemmDescr);
+
+// #############################################################################
+// # SAMPLED DENSE-DENSE MATRIX MULTIPLICATION
+// #############################################################################
+
+CUSPARSE_DEPRECATED(cusparseSDDMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseConstrainedGeMM(cusparseHandle_t     handle,
+                        cusparseOperation_t  opA,
+                        cusparseOperation_t  opB,
+                        const void*          alpha,
+                        cusparseDnMatDescr_t matA,
+                        cusparseDnMatDescr_t matB,
+                        const void*          beta,
+                        cusparseSpMatDescr_t matC,
+                        cudaDataType         computeType,
+                        void*                externalBuffer);
+
+CUSPARSE_DEPRECATED(cusparseSDDMM)
+cusparseStatus_t CUSPARSEAPI
+cusparseConstrainedGeMM_bufferSize(cusparseHandle_t     handle,
+                                   cusparseOperation_t  opA,
+                                   cusparseOperation_t  opB,
+                                   const void*          alpha,
+                                   cusparseDnMatDescr_t matA,
+                                   cusparseDnMatDescr_t matB,
+                                   const void*          beta,
+                                   cusparseSpMatDescr_t matC,
+                                   cudaDataType         computeType,
+                                   size_t*              bufferSize);
+
+typedef enum {
+    CUSPARSE_SDDMM_ALG_DEFAULT = 0
+} cusparseSDDMMAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSDDMM_bufferSize(cusparseHandle_t     handle,
+                         cusparseOperation_t  opA,
+                         cusparseOperation_t  opB,
+                         const void*          alpha,
+                         cusparseDnMatDescr_t matA,
+                         cusparseDnMatDescr_t matB,
+                         const void*          beta,
+                         cusparseSpMatDescr_t matC,
+                         cudaDataType         computeType,
+                         cusparseSDDMMAlg_t   alg,
+                         size_t*              bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSDDMM_preprocess(cusparseHandle_t     handle,
+                         cusparseOperation_t  opA,
+                         cusparseOperation_t  opB,
+                         const void*          alpha,
+                         cusparseDnMatDescr_t matA,
+                         cusparseDnMatDescr_t matB,
+                         const void*          beta,
+                         cusparseSpMatDescr_t matC,
+                         cudaDataType         computeType,
+                         cusparseSDDMMAlg_t   alg,
+                         void*                externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSDDMM(cusparseHandle_t     handle,
+              cusparseOperation_t  opA,
+              cusparseOperation_t  opB,
+              const void*          alpha,
+              cusparseDnMatDescr_t matA,
+              cusparseDnMatDescr_t matB,
+              const void*          beta,
+              cusparseSpMatDescr_t matC,
+              cudaDataType         computeType,
+              cusparseSDDMMAlg_t   alg,
+              void*                externalBuffer);
+
+// #############################################################################
+// # GENERIC APIs WITH CUSTOM OPERATORS (PREVIEW)
+// #############################################################################
+
+struct cusparseSpMMOpPlan;
+typedef struct cusparseSpMMOpPlan* cusparseSpMMOpPlan_t;
+
+typedef enum {
+    CUSPARSE_SPMM_OP_ALG_DEFAULT
+} cusparseSpMMOpAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMMOp_createPlan(cusparseHandle_t      handle,
+                          cusparseSpMMOpPlan_t* plan,
+                          cusparseOperation_t   opA,
+                          cusparseOperation_t   opB,
+                          cusparseSpMatDescr_t  matA,
+                          cusparseDnMatDescr_t  matB,
+                          cusparseDnMatDescr_t  matC,
+                          cudaDataType          computeType,
+                          cusparseSpMMOpAlg_t   alg,
+                          const void*           addOperationNvvmBuffer,
+                          size_t                addOperationBufferSize,
+                          const void*           mulOperationNvvmBuffer,
+                          size_t                mulOperationBufferSize,
+                          const void*           epilogueNvvmBuffer,
+                          size_t                epilogueBufferSize,
+                          size_t*               SpMMWorkspaceSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMMOp(cusparseSpMMOpPlan_t plan,
+               void*                externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMMOp_destroyPlan(cusparseSpMMOpPlan_t plan);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif // defined(__cplusplus)
+
+#undef CUSPARSE_DEPRECATED
+#undef CUSPARSE_PREVIEW
+
+#endif // !defined(CUSPARSE_H_)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10c57451644fd7d608d0090a9af6b54e36fbe445
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ece81b45e771af99cf98d4237b4728b4c3dcff6b
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h
new file mode 100644
index 0000000000000000000000000000000000000000..aae5625bdb8850e045ca04711ff5da845511fbf2
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h
@@ -0,0 +1,448 @@
+/*************************************************************************
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_H_
+#define NCCL_H_
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#if CUDART_VERSION >= 11000
+#include <cuda_bf16.h>
+#endif
+
+#define NCCL_MAJOR 2
+#define NCCL_MINOR 20
+#define NCCL_PATCH 5
+#define NCCL_SUFFIX ""
+
+#define NCCL_VERSION_CODE 22005
+#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <limits.h>
+/* Opaque handle to communicator */
+typedef struct ncclComm* ncclComm_t;
+#define NCCL_COMM_NULL NULL
+
+#define NCCL_UNIQUE_ID_BYTES 128
+typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+
+/* Error type */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6,
+               ncclInProgress              =  7,
+               ncclNumResults              =  8 } ncclResult_t;
+
+#define NCCL_CONFIG_UNDEF_INT INT_MIN
+#define NCCL_CONFIG_UNDEF_PTR NULL
+#define NCCL_SPLIT_NOCOLOR -1
+
+/* Communicator configuration. Users can assign value to attributes to specify the
+ * behavior of a communicator. */
+typedef struct ncclConfig_v21700 {
+  /* attributes that users should never touch. */
+  size_t size;
+  unsigned int magic;
+  unsigned int version;
+  /* attributes that users are able to customize. */
+  int blocking;
+  int cgaClusterSize;
+  int minCTAs;
+  int maxCTAs;
+  const char *netName;
+  int splitShare;
+} ncclConfig_t;
+
+/* Config initializer must be assigned to initialize config structure when it is created.
+ * Not initialized config will result in NCCL error. */
+#define NCCL_CONFIG_INITIALIZER {                                       \
+  sizeof(ncclConfig_t), /* size */                                      \
+  0xcafebeef,           /* magic */                                     \
+  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */       \
+  NCCL_CONFIG_UNDEF_INT,                    /* blocking */              \
+  NCCL_CONFIG_UNDEF_INT,                    /* cgaClusterSize */        \
+  NCCL_CONFIG_UNDEF_INT,                    /* minCTAs */               \
+  NCCL_CONFIG_UNDEF_INT,                    /* maxCTAs */               \
+  NCCL_CONFIG_UNDEF_PTR,                    /* netName */               \
+  NCCL_CONFIG_UNDEF_INT                     /* splitShare */            \
+}
+
+/* NCCL malloc and free function for all types of NCCL optimizations
+ * (e.g. user buffer registration). The actual allocated size might
+ * be larger than requested due to granularity requirement. */
+ncclResult_t  ncclMemAlloc(void** ptr, size_t size);
+ncclResult_t pncclMemAlloc(void** ptr, size_t size);
+
+ncclResult_t  ncclMemFree(void *ptr);
+ncclResult_t pncclMemFree(void *ptr);
+
+/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
+ * NCCL library
+ */
+ncclResult_t  ncclGetVersion(int *version);
+ncclResult_t pncclGetVersion(int *version);
+
+/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
+ * called once and the Id should be distributed to all ranks in the
+ * communicator before calling ncclCommInitRank. */
+ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
+ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
+
+/* Create a new communicator (multi thread/process version) with a configuration
+ * set by users. */
+ncclResult_t  ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
+ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
+
+/* Creates a new communicator (multi thread/process version).
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
+ * Each rank is associated to a CUDA device, which has to be set before calling
+ * ncclCommInitRank.
+ * ncclCommInitRank implicitly syncronizes with other ranks, so it must be
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
+ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+
+/* Creates a clique of communicators (single process version).
+ * This is a convenience function to create a single-process communicator clique.
+ * Returns an array of ndev newly initialized communicators in comm.
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
+ * If devlist is NULL, the first ndev CUDA devices are used.
+ * Order of devlist defines user-order of processors within the communicator. */
+ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+
+/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
+ * when the communicator is globally quiescent and related resources are freed; then,
+ * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
+ * itself) without blocking. */
+ncclResult_t  ncclCommFinalize(ncclComm_t comm);
+ncclResult_t pncclCommFinalize(ncclComm_t comm);
+
+/* Frees local resources associated with communicator object. */
+ncclResult_t  ncclCommDestroy(ncclComm_t comm);
+ncclResult_t pncclCommDestroy(ncclComm_t comm);
+
+/* Frees resources associated with communicator object and aborts any operations
+ * that might still be running on the device. */
+ncclResult_t  ncclCommAbort(ncclComm_t comm);
+ncclResult_t pncclCommAbort(ncclComm_t comm);
+
+/* Creates one or more communicators from an existing one.
+ * Ranks with the same color will end up in the same communicator.
+ * Within the new communicator, key will be used to order ranks.
+ * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
+ * and will therefore return a NULL communicator.
+ * If config is NULL, the new communicator will inherit the original communicator's
+ * configuration*/
+ncclResult_t  ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+
+/* Returns a string for each error code. */
+const char*  ncclGetErrorString(ncclResult_t result);
+const char* pncclGetErrorString(ncclResult_t result);
+
+/* Returns a human-readable message of the last error that occurred. */
+const char*  ncclGetLastError(ncclComm_t comm);
+const char* pncclGetLastError(ncclComm_t comm);
+
+/* Checks whether the comm has encountered any asynchronous errors */
+ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
+/* Gets the number of ranks in the communicator clique. */
+ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
+ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
+
+/* Returns the cuda device number associated with the communicator. */
+ncclResult_t  ncclCommCuDevice(const ncclComm_t comm, int* device);
+ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
+
+/* Returns the user-ordered "rank" associated with the communicator. */
+ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
+ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
+
+
+/* Register CUDA buffer for zero-copy operation */
+ncclResult_t  ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+
+/* Deregister CUDA buffer */
+ncclResult_t  ncclCommDeregister(const ncclComm_t comm, void* handle);
+ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
+
+/* Reduction operation selector */
+typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
+typedef enum { ncclSum        = 0,
+               ncclProd       = 1,
+               ncclMax        = 2,
+               ncclMin        = 3,
+               ncclAvg        = 4,
+               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
+                * serves as the least possible value for dynamic ncclRedOp_t's
+                * as constructed by ncclRedOpCreate*** functions. */
+               ncclNumOps     = 5,
+               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
+                * It is defined to be the largest signed value (since compilers
+                * are permitted to use signed enums) that won't grow
+                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
+                * maintain ABI compatibility. */
+               ncclMaxRedOp   = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
+             } ncclRedOp_t;
+
+/* Data types */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+               ncclBfloat16   = 9,
+               ncclNumTypes   = 10
+#else
+               ncclNumTypes   = 9
+#endif
+} ncclDataType_t;
+
+/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
+typedef enum {
+  /* ncclScalarDevice: The scalar is in device-visible memory and will be
+   * dereferenced while the collective is running. */
+  ncclScalarDevice = 0,
+
+  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
+   * dereferenced before the ncclRedOpCreate***() function returns. */
+  ncclScalarHostImmediate = 1
+} ncclScalarResidence_t;
+
+/*
+ * ncclRedOpCreatePreMulSum
+ *
+ * Creates a new reduction operator which pre-multiplies input values by a given
+ * scalar locally before reducing them with peer values via summation. For use
+ * only with collectives launched against *comm* and *datatype*. The
+ * *residence* argument indicates how/when the memory pointed to by *scalar*
+ * will be dereferenced. Upon return, the newly created operator's handle
+ * is stored in *op*.
+ */
+ncclResult_t  ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
+ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
+
+/*
+ * ncclRedOpDestroy
+ *
+ * Destroys the reduction operator *op*. The operator must have been created by
+ * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be
+ * destroyed as soon as the last NCCL function which is given that operator returns.
+ */
+ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
+ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
+
+/*
+ * Collective communication operations
+ *
+ * Collective communication operations must be called separately for each
+ * communicator in a communicator clique.
+ *
+ * They return when operations have been enqueued on the CUDA stream.
+ *
+ * Since they may perform inter-CPU synchronization, each call has to be done
+ * from a different thread or process, or need to use Group Semantics (see
+ * below).
+ */
+
+/*
+ * Reduce
+ *
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
+ * operation.
+ * recvbuff may be NULL on all calls except for root device.
+ * root is the rank (not the CUDA device) where data will reside after the
+ * operation is complete.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * (deprecated) Broadcast (in-place)
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * This operation is implicitely in place.
+ */
+ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Broadcast
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * All-Reduce
+ *
+ * Reduces data arrays of length count in sendbuff using op operation, and
+ * leaves identical copies of result on each recvbuff.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Reduce-Scatter
+ *
+ * Reduces data in sendbuff using op operation and leaves reduced result
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
+ * block of the result.
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
+ * should have a size of at least nranks*recvcount elements.
+ *
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
+ */
+ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+
+/*
+ * All-Gather
+ *
+ * Each device gathers sendcount values from other GPUs into recvbuff,
+ * receiving data from rank i at offset i*sendcount.
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
+ * should have a size of at least nranks*sendcount elements.
+ *
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
+ */
+ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Send
+ *
+ * Send data from sendbuff to rank peer.
+ *
+ * Rank peer needs to call ncclRecv with the same datatype and the same count from this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ncclResult_t  ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Receive
+ *
+ * Receive data from rank peer into recvbuff.
+ *
+ * Rank peer needs to call ncclSend with the same datatype and the same count to this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t  ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Group semantics
+ *
+ * When managing multiple GPUs from a single thread, and since NCCL collective
+ * calls may perform inter-CPU synchronization, we need to "group" calls for
+ * different ranks/devices into a single call.
+ *
+ * Grouping NCCL calls as being part of the same collective operation is done
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
+ * to be complete. Note that for collective communication, ncclGroupEnd only
+ * guarantees that the operations are enqueued on the streams, not that
+ * the operation is effectively done.
+ *
+ * Both collective communication and ncclCommInitRank can be used in conjunction
+ * of ncclGroupStart/ncclGroupEnd, but not together.
+ *
+ * Group semantics also allow to fuse multiple operations on the same device
+ * to improve performance (for aggregated collective calls), or to permit
+ * concurrent progress of multiple send/receive operations.
+ */
+
+/*
+ * Group Start
+ *
+ * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
+ * a single NCCL operation. Nothing will be started on the CUDA stream until
+ * ncclGroupEnd.
+ */
+ncclResult_t  ncclGroupStart();
+ncclResult_t pncclGroupStart();
+
+/*
+ * Group End
+ *
+ * End a group call. Start a fused NCCL operation consisting of all calls since
+ * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
+ * need to be called after ncclGroupEnd.
+ */
+ncclResult_t  ncclGroupEnd();
+ncclResult_t pncclGroupEnd();
+
+/* Register CUDA buffer for zero-copy operation */
+ncclResult_t  ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+
+/* Deregister CUDA buffer */
+ncclResult_t  ncclCommDeregister(const ncclComm_t comm, void* handle);
+ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
+
+#endif // end include guard
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py
new file mode 100644
index 0000000000000000000000000000000000000000..2c84208a5d87511cc4a63dcd9c647ac75c6f4475
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/_cmd.py
@@ -0,0 +1,70 @@
+# SPDX-FileCopyrightText: 2015 Eric Larson
+#
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import logging
+from argparse import ArgumentParser
+from typing import TYPE_CHECKING
+
+from pip._vendor import requests
+
+from pip._vendor.cachecontrol.adapter import CacheControlAdapter
+from pip._vendor.cachecontrol.cache import DictCache
+from pip._vendor.cachecontrol.controller import logger
+
+if TYPE_CHECKING:
+    from argparse import Namespace
+
+    from pip._vendor.cachecontrol.controller import CacheController
+
+
+def setup_logging() -> None:
+    logger.setLevel(logging.DEBUG)
+    handler = logging.StreamHandler()
+    logger.addHandler(handler)
+
+
+def get_session() -> requests.Session:
+    adapter = CacheControlAdapter(
+        DictCache(), cache_etags=True, serializer=None, heuristic=None
+    )
+    sess = requests.Session()
+    sess.mount("http://", adapter)
+    sess.mount("https://", adapter)
+
+    sess.cache_controller = adapter.controller  # type: ignore[attr-defined]
+    return sess
+
+
+def get_args() -> Namespace:
+    parser = ArgumentParser()
+    parser.add_argument("url", help="The URL to try and cache")
+    return parser.parse_args()
+
+
+def main() -> None:
+    args = get_args()
+    sess = get_session()
+
+    # Make a request to get a response
+    resp = sess.get(args.url)
+
+    # Turn on logging
+    setup_logging()
+
+    # try setting the cache
+    cache_controller: CacheController = (
+        sess.cache_controller  # type: ignore[attr-defined]
+    )
+    cache_controller.cache_response(resp.request, resp.raw)
+
+    # Now try to get it
+    if cache_controller.cached_request(resp.request):
+        print("Cached!")
+    else:
+        print("Not cached :(")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbb4ecc88762f8a3602417e2bab446f97b319196
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/adapter.py
@@ -0,0 +1,161 @@
+# SPDX-FileCopyrightText: 2015 Eric Larson
+#
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import functools
+import types
+import zlib
+from typing import TYPE_CHECKING, Any, Collection, Mapping
+
+from pip._vendor.requests.adapters import HTTPAdapter
+
+from pip._vendor.cachecontrol.cache import DictCache
+from pip._vendor.cachecontrol.controller import PERMANENT_REDIRECT_STATUSES, CacheController
+from pip._vendor.cachecontrol.filewrapper import CallbackFileWrapper
+
+if TYPE_CHECKING:
+    from pip._vendor.requests import PreparedRequest, Response
+    from pip._vendor.urllib3 import HTTPResponse
+
+    from pip._vendor.cachecontrol.cache import BaseCache
+    from pip._vendor.cachecontrol.heuristics import BaseHeuristic
+    from pip._vendor.cachecontrol.serialize import Serializer
+
+
+class CacheControlAdapter(HTTPAdapter):
+    invalidating_methods = {"PUT", "PATCH", "DELETE"}
+
+    def __init__(
+        self,
+        cache: BaseCache | None = None,
+        cache_etags: bool = True,
+        controller_class: type[CacheController] | None = None,
+        serializer: Serializer | None = None,
+        heuristic: BaseHeuristic | None = None,
+        cacheable_methods: Collection[str] | None = None,
+        *args: Any,
+        **kw: Any,
+    ) -> None:
+        super().__init__(*args, **kw)
+        self.cache = DictCache() if cache is None else cache
+        self.heuristic = heuristic
+        self.cacheable_methods = cacheable_methods or ("GET",)
+
+        controller_factory = controller_class or CacheController
+        self.controller = controller_factory(
+            self.cache, cache_etags=cache_etags, serializer=serializer
+        )
+
+    def send(
+        self,
+        request: PreparedRequest,
+        stream: bool = False,
+        timeout: None | float | tuple[float, float] | tuple[float, None] = None,
+        verify: bool | str = True,
+        cert: (None | bytes | str | tuple[bytes | str, bytes | str]) = None,
+        proxies: Mapping[str, str] | None = None,
+        cacheable_methods: Collection[str] | None = None,
+    ) -> Response:
+        """
+        Send a request. Use the request information to see if it
+        exists in the cache and cache the response if we need to and can.
+        """
+        cacheable = cacheable_methods or self.cacheable_methods
+        if request.method in cacheable:
+            try:
+                cached_response = self.controller.cached_request(request)
+            except zlib.error:
+                cached_response = None
+            if cached_response:
+                return self.build_response(request, cached_response, from_cache=True)
+
+            # check for etags and add headers if appropriate
+            request.headers.update(self.controller.conditional_headers(request))
+
+        resp = super().send(request, stream, timeout, verify, cert, proxies)
+
+        return resp
+
+    def build_response(
+        self,
+        request: PreparedRequest,
+        response: HTTPResponse,
+        from_cache: bool = False,
+        cacheable_methods: Collection[str] | None = None,
+    ) -> Response:
+        """
+        Build a response by making a request or using the cache.
+
+        This will end up calling send and returning a potentially
+        cached response
+        """
+        cacheable = cacheable_methods or self.cacheable_methods
+        if not from_cache and request.method in cacheable:
+            # Check for any heuristics that might update headers
+            # before trying to cache.
+            if self.heuristic:
+                response = self.heuristic.apply(response)
+
+            # apply any expiration heuristics
+            if response.status == 304:
+                # We must have sent an ETag request. This could mean
+                # that we've been expired already or that we simply
+                # have an etag. In either case, we want to try and
+                # update the cache if that is the case.
+                cached_response = self.controller.update_cached_response(
+                    request, response
+                )
+
+                if cached_response is not response:
+                    from_cache = True
+
+                # We are done with the server response, read a
+                # possible response body (compliant servers will
+                # not return one, but we cannot be 100% sure) and
+                # release the connection back to the pool.
+                response.read(decode_content=False)
+                response.release_conn()
+
+                response = cached_response
+
+            # We always cache the 301 responses
+            elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
+                self.controller.cache_response(request, response)
+            else:
+                # Wrap the response file with a wrapper that will cache the
+                #   response when the stream has been consumed.
+                response._fp = CallbackFileWrapper(  # type: ignore[assignment]
+                    response._fp,  # type: ignore[arg-type]
+                    functools.partial(
+                        self.controller.cache_response, request, response
+                    ),
+                )
+                if response.chunked:
+                    super_update_chunk_length = response._update_chunk_length
+
+                    def _update_chunk_length(self: HTTPResponse) -> None:
+                        super_update_chunk_length()
+                        if self.chunk_left == 0:
+                            self._fp._close()  # type: ignore[union-attr]
+
+                    response._update_chunk_length = types.MethodType(  # type: ignore[method-assign]
+                        _update_chunk_length, response
+                    )
+
+        resp: Response = super().build_response(request, response)  # type: ignore[no-untyped-call]
+
+        # See if we should invalidate the cache.
+        if request.method in self.invalidating_methods and resp.ok:
+            assert request.url is not None
+            cache_url = self.controller.cache_url(request.url)
+            self.cache.delete(cache_url)
+
+        # Give the request a from_cache attr to let people use it
+        resp.from_cache = from_cache  # type: ignore[attr-defined]
+
+        return resp
+
+    def close(self) -> None:
+        self.cache.close()
+        super().close()  # type: ignore[no-untyped-call]
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..3293b0057c789e4bfeafb926cb9647f280ce309e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/cache.py
@@ -0,0 +1,74 @@
+# SPDX-FileCopyrightText: 2015 Eric Larson
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+The cache object API for implementing caches. The default is a thread
+safe in-memory dictionary.
+"""
+from __future__ import annotations
+
+from threading import Lock
+from typing import IO, TYPE_CHECKING, MutableMapping
+
+if TYPE_CHECKING:
+    from datetime import datetime
+
+
+class BaseCache:
+    def get(self, key: str) -> bytes | None:
+        raise NotImplementedError()
+
+    def set(
+        self, key: str, value: bytes, expires: int | datetime | None = None
+    ) -> None:
+        raise NotImplementedError()
+
+    def delete(self, key: str) -> None:
+        raise NotImplementedError()
+
+    def close(self) -> None:
+        pass
+
+
+class DictCache(BaseCache):
+    def __init__(self, init_dict: MutableMapping[str, bytes] | None = None) -> None:
+        self.lock = Lock()
+        self.data = init_dict or {}
+
+    def get(self, key: str) -> bytes | None:
+        return self.data.get(key, None)
+
+    def set(
+        self, key: str, value: bytes, expires: int | datetime | None = None
+    ) -> None:
+        with self.lock:
+            self.data.update({key: value})
+
+    def delete(self, key: str) -> None:
+        with self.lock:
+            if key in self.data:
+                self.data.pop(key)
+
+
+class SeparateBodyBaseCache(BaseCache):
+    """
+    In this variant, the body is not stored mixed in with the metadata, but is
+    passed in (as a bytes-like object) in a separate call to ``set_body()``.
+
+    That is, the expected interaction pattern is::
+
+        cache.set(key, serialized_metadata)
+        cache.set_body(key)
+
+    Similarly, the body should be loaded separately via ``get_body()``.
+    """
+
+    def set_body(self, key: str, body: bytes) -> None:
+        raise NotImplementedError()
+
+    def get_body(self, key: str) -> IO[bytes] | None:
+        """
+        Return the body as file-like object.
+        """
+        raise NotImplementedError()
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py
new file mode 100644
index 0000000000000000000000000000000000000000..d7dd86e5f702ea0dd156b9cb8217784dbc5488eb
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/controller.py
@@ -0,0 +1,499 @@
+# SPDX-FileCopyrightText: 2015 Eric Larson
+#
+# SPDX-License-Identifier: Apache-2.0
+
+"""
+The httplib2 algorithms ported for use with requests.
+"""
+from __future__ import annotations
+
+import calendar
+import logging
+import re
+import time
+from email.utils import parsedate_tz
+from typing import TYPE_CHECKING, Collection, Mapping
+
+from pip._vendor.requests.structures import CaseInsensitiveDict
+
+from pip._vendor.cachecontrol.cache import DictCache, SeparateBodyBaseCache
+from pip._vendor.cachecontrol.serialize import Serializer
+
+if TYPE_CHECKING:
+    from typing import Literal
+
+    from pip._vendor.requests import PreparedRequest
+    from pip._vendor.urllib3 import HTTPResponse
+
+    from pip._vendor.cachecontrol.cache import BaseCache
+
+logger = logging.getLogger(__name__)
+
+URI = re.compile(r"^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?")
+
+PERMANENT_REDIRECT_STATUSES = (301, 308)
+
+
+def parse_uri(uri: str) -> tuple[str, str, str, str, str]:
+    """Parses a URI using the regex given in Appendix B of RFC 3986.
+
+    (scheme, authority, path, query, fragment) = parse_uri(uri)
+    """
+    match = URI.match(uri)
+    assert match is not None
+    groups = match.groups()
+    return (groups[1], groups[3], groups[4], groups[6], groups[8])
+
+
+class CacheController:
+    """An interface to see if request should cached or not."""
+
+    def __init__(
+        self,
+        cache: BaseCache | None = None,
+        cache_etags: bool = True,
+        serializer: Serializer | None = None,
+        status_codes: Collection[int] | None = None,
+    ):
+        self.cache = DictCache() if cache is None else cache
+        self.cache_etags = cache_etags
+        self.serializer = serializer or Serializer()
+        self.cacheable_status_codes = status_codes or (200, 203, 300, 301, 308)
+
+    @classmethod
+    def _urlnorm(cls, uri: str) -> str:
+        """Normalize the URL to create a safe key for the cache"""
+        (scheme, authority, path, query, fragment) = parse_uri(uri)
+        if not scheme or not authority:
+            raise Exception("Only absolute URIs are allowed. uri = %s" % uri)
+
+        scheme = scheme.lower()
+        authority = authority.lower()
+
+        if not path:
+            path = "/"
+
+        # Could do syntax based normalization of the URI before
+        # computing the digest. See Section 6.2.2 of Std 66.
+        request_uri = query and "?".join([path, query]) or path
+        defrag_uri = scheme + "://" + authority + request_uri
+
+        return defrag_uri
+
+    @classmethod
+    def cache_url(cls, uri: str) -> str:
+        return cls._urlnorm(uri)
+
+    def parse_cache_control(self, headers: Mapping[str, str]) -> dict[str, int | None]:
+        known_directives = {
+            # https://tools.ietf.org/html/rfc7234#section-5.2
+            "max-age": (int, True),
+            "max-stale": (int, False),
+            "min-fresh": (int, True),
+            "no-cache": (None, False),
+            "no-store": (None, False),
+            "no-transform": (None, False),
+            "only-if-cached": (None, False),
+            "must-revalidate": (None, False),
+            "public": (None, False),
+            "private": (None, False),
+            "proxy-revalidate": (None, False),
+            "s-maxage": (int, True),
+        }
+
+        cc_headers = headers.get("cache-control", headers.get("Cache-Control", ""))
+
+        retval: dict[str, int | None] = {}
+
+        for cc_directive in cc_headers.split(","):
+            if not cc_directive.strip():
+                continue
+
+            parts = cc_directive.split("=", 1)
+            directive = parts[0].strip()
+
+            try:
+                typ, required = known_directives[directive]
+            except KeyError:
+                logger.debug("Ignoring unknown cache-control directive: %s", directive)
+                continue
+
+            if not typ or not required:
+                retval[directive] = None
+            if typ:
+                try:
+                    retval[directive] = typ(parts[1].strip())
+                except IndexError:
+                    if required:
+                        logger.debug(
+                            "Missing value for cache-control " "directive: %s",
+                            directive,
+                        )
+                except ValueError:
+                    logger.debug(
+                        "Invalid value for cache-control directive " "%s, must be %s",
+                        directive,
+                        typ.__name__,
+                    )
+
+        return retval
+
+    def _load_from_cache(self, request: PreparedRequest) -> HTTPResponse | None:
+        """
+        Load a cached response, or return None if it's not available.
+        """
+        # We do not support caching of partial content: so if the request contains a
+        # Range header then we don't want to load anything from the cache.
+        if "Range" in request.headers:
+            return None
+
+        cache_url = request.url
+        assert cache_url is not None
+        cache_data = self.cache.get(cache_url)
+        if cache_data is None:
+            logger.debug("No cache entry available")
+            return None
+
+        if isinstance(self.cache, SeparateBodyBaseCache):
+            body_file = self.cache.get_body(cache_url)
+        else:
+            body_file = None
+
+        result = self.serializer.loads(request, cache_data, body_file)
+        if result is None:
+            logger.warning("Cache entry deserialization failed, entry ignored")
+        return result
+
+    def cached_request(self, request: PreparedRequest) -> HTTPResponse | Literal[False]:
+        """
+        Return a cached response if it exists in the cache, otherwise
+        return False.
+        """
+        assert request.url is not None
+        cache_url = self.cache_url(request.url)
+        logger.debug('Looking up "%s" in the cache', cache_url)
+        cc = self.parse_cache_control(request.headers)
+
+        # Bail out if the request insists on fresh data
+        if "no-cache" in cc:
+            logger.debug('Request header has "no-cache", cache bypassed')
+            return False
+
+        if "max-age" in cc and cc["max-age"] == 0:
+            logger.debug('Request header has "max_age" as 0, cache bypassed')
+            return False
+
+        # Check whether we can load the response from the cache:
+        resp = self._load_from_cache(request)
+        if not resp:
+            return False
+
+        # If we have a cached permanent redirect, return it immediately. We
+        # don't need to test our response for other headers b/c it is
+        # intrinsically "cacheable" as it is Permanent.
+        #
+        # See:
+        #   https://tools.ietf.org/html/rfc7231#section-6.4.2
+        #
+        # Client can try to refresh the value by repeating the request
+        # with cache busting headers as usual (ie no-cache).
+        if int(resp.status) in PERMANENT_REDIRECT_STATUSES:
+            msg = (
+                "Returning cached permanent redirect response "
+                "(ignoring date and etag information)"
+            )
+            logger.debug(msg)
+            return resp
+
+        headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers)
+        if not headers or "date" not in headers:
+            if "etag" not in headers:
+                # Without date or etag, the cached response can never be used
+                # and should be deleted.
+                logger.debug("Purging cached response: no date or etag")
+                self.cache.delete(cache_url)
+            logger.debug("Ignoring cached response: no date")
+            return False
+
+        now = time.time()
+        time_tuple = parsedate_tz(headers["date"])
+        assert time_tuple is not None
+        date = calendar.timegm(time_tuple[:6])
+        current_age = max(0, now - date)
+        logger.debug("Current age based on date: %i", current_age)
+
+        # TODO: There is an assumption that the result will be a
+        #       urllib3 response object. This may not be best since we
+        #       could probably avoid instantiating or constructing the
+        #       response until we know we need it.
+        resp_cc = self.parse_cache_control(headers)
+
+        # determine freshness
+        freshness_lifetime = 0
+
+        # Check the max-age pragma in the cache control header
+        max_age = resp_cc.get("max-age")
+        if max_age is not None:
+            freshness_lifetime = max_age
+            logger.debug("Freshness lifetime from max-age: %i", freshness_lifetime)
+
+        # If there isn't a max-age, check for an expires header
+        elif "expires" in headers:
+            expires = parsedate_tz(headers["expires"])
+            if expires is not None:
+                expire_time = calendar.timegm(expires[:6]) - date
+                freshness_lifetime = max(0, expire_time)
+                logger.debug("Freshness lifetime from expires: %i", freshness_lifetime)
+
+        # Determine if we are setting freshness limit in the
+        # request. Note, this overrides what was in the response.
+        max_age = cc.get("max-age")
+        if max_age is not None:
+            freshness_lifetime = max_age
+            logger.debug(
+                "Freshness lifetime from request max-age: %i", freshness_lifetime
+            )
+
+        min_fresh = cc.get("min-fresh")
+        if min_fresh is not None:
+            # adjust our current age by our min fresh
+            current_age += min_fresh
+            logger.debug("Adjusted current age from min-fresh: %i", current_age)
+
+        # Return entry if it is fresh enough
+        if freshness_lifetime > current_age:
+            logger.debug('The response is "fresh", returning cached response')
+            logger.debug("%i > %i", freshness_lifetime, current_age)
+            return resp
+
+        # we're not fresh. If we don't have an Etag, clear it out
+        if "etag" not in headers:
+            logger.debug('The cached response is "stale" with no etag, purging')
+            self.cache.delete(cache_url)
+
+        # return the original handler
+        return False
+
+    def conditional_headers(self, request: PreparedRequest) -> dict[str, str]:
+        resp = self._load_from_cache(request)
+        new_headers = {}
+
+        if resp:
+            headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(resp.headers)
+
+            if "etag" in headers:
+                new_headers["If-None-Match"] = headers["ETag"]
+
+            if "last-modified" in headers:
+                new_headers["If-Modified-Since"] = headers["Last-Modified"]
+
+        return new_headers
+
+    def _cache_set(
+        self,
+        cache_url: str,
+        request: PreparedRequest,
+        response: HTTPResponse,
+        body: bytes | None = None,
+        expires_time: int | None = None,
+    ) -> None:
+        """
+        Store the data in the cache.
+        """
+        if isinstance(self.cache, SeparateBodyBaseCache):
+            # We pass in the body separately; just put a placeholder empty
+            # string in the metadata.
+            self.cache.set(
+                cache_url,
+                self.serializer.dumps(request, response, b""),
+                expires=expires_time,
+            )
+            # body is None can happen when, for example, we're only updating
+            # headers, as is the case in update_cached_response().
+            if body is not None:
+                self.cache.set_body(cache_url, body)
+        else:
+            self.cache.set(
+                cache_url,
+                self.serializer.dumps(request, response, body),
+                expires=expires_time,
+            )
+
+    def cache_response(
+        self,
+        request: PreparedRequest,
+        response: HTTPResponse,
+        body: bytes | None = None,
+        status_codes: Collection[int] | None = None,
+    ) -> None:
+        """
+        Algorithm for caching requests.
+
+        This assumes a requests Response object.
+        """
+        # From httplib2: Don't cache 206's since we aren't going to
+        #                handle byte range requests
+        cacheable_status_codes = status_codes or self.cacheable_status_codes
+        if response.status not in cacheable_status_codes:
+            logger.debug(
+                "Status code %s not in %s", response.status, cacheable_status_codes
+            )
+            return
+
+        response_headers: CaseInsensitiveDict[str] = CaseInsensitiveDict(
+            response.headers
+        )
+
+        if "date" in response_headers:
+            time_tuple = parsedate_tz(response_headers["date"])
+            assert time_tuple is not None
+            date = calendar.timegm(time_tuple[:6])
+        else:
+            date = 0
+
+        # If we've been given a body, our response has a Content-Length, that
+        # Content-Length is valid then we can check to see if the body we've
+        # been given matches the expected size, and if it doesn't we'll just
+        # skip trying to cache it.
+        if (
+            body is not None
+            and "content-length" in response_headers
+            and response_headers["content-length"].isdigit()
+            and int(response_headers["content-length"]) != len(body)
+        ):
+            return
+
+        cc_req = self.parse_cache_control(request.headers)
+        cc = self.parse_cache_control(response_headers)
+
+        assert request.url is not None
+        cache_url = self.cache_url(request.url)
+        logger.debug('Updating cache with response from "%s"', cache_url)
+
+        # Delete it from the cache if we happen to have it stored there
+        no_store = False
+        if "no-store" in cc:
+            no_store = True
+            logger.debug('Response header has "no-store"')
+        if "no-store" in cc_req:
+            no_store = True
+            logger.debug('Request header has "no-store"')
+        if no_store and self.cache.get(cache_url):
+            logger.debug('Purging existing cache entry to honor "no-store"')
+            self.cache.delete(cache_url)
+        if no_store:
+            return
+
+        # https://tools.ietf.org/html/rfc7234#section-4.1:
+        # A Vary header field-value of "*" always fails to match.
+        # Storing such a response leads to a deserialization warning
+        # during cache lookup and is not allowed to ever be served,
+        # so storing it can be avoided.
+        if "*" in response_headers.get("vary", ""):
+            logger.debug('Response header has "Vary: *"')
+            return
+
+        # If we've been given an etag, then keep the response
+        if self.cache_etags and "etag" in response_headers:
+            expires_time = 0
+            if response_headers.get("expires"):
+                expires = parsedate_tz(response_headers["expires"])
+                if expires is not None:
+                    expires_time = calendar.timegm(expires[:6]) - date
+
+            expires_time = max(expires_time, 14 * 86400)
+
+            logger.debug(f"etag object cached for {expires_time} seconds")
+            logger.debug("Caching due to etag")
+            self._cache_set(cache_url, request, response, body, expires_time)
+
+        # Add to the cache any permanent redirects. We do this before looking
+        # that the Date headers.
+        elif int(response.status) in PERMANENT_REDIRECT_STATUSES:
+            logger.debug("Caching permanent redirect")
+            self._cache_set(cache_url, request, response, b"")
+
+        # Add to the cache if the response headers demand it. If there
+        # is no date header then we can't do anything about expiring
+        # the cache.
+        elif "date" in response_headers:
+            time_tuple = parsedate_tz(response_headers["date"])
+            assert time_tuple is not None
+            date = calendar.timegm(time_tuple[:6])
+            # cache when there is a max-age > 0
+            max_age = cc.get("max-age")
+            if max_age is not None and max_age > 0:
+                logger.debug("Caching b/c date exists and max-age > 0")
+                expires_time = max_age
+                self._cache_set(
+                    cache_url,
+                    request,
+                    response,
+                    body,
+                    expires_time,
+                )
+
+            # If the request can expire, it means we should cache it
+            # in the meantime.
+            elif "expires" in response_headers:
+                if response_headers["expires"]:
+                    expires = parsedate_tz(response_headers["expires"])
+                    if expires is not None:
+                        expires_time = calendar.timegm(expires[:6]) - date
+                    else:
+                        expires_time = None
+
+                    logger.debug(
+                        "Caching b/c of expires header. expires in {} seconds".format(
+                            expires_time
+                        )
+                    )
+                    self._cache_set(
+                        cache_url,
+                        request,
+                        response,
+                        body,
+                        expires_time,
+                    )
+
+    def update_cached_response(
+        self, request: PreparedRequest, response: HTTPResponse
+    ) -> HTTPResponse:
+        """On a 304 we will get a new set of headers that we want to
+        update our cached value with, assuming we have one.
+
+        This should only ever be called when we've sent an ETag and
+        gotten a 304 as the response.
+        """
+        assert request.url is not None
+        cache_url = self.cache_url(request.url)
+        cached_response = self._load_from_cache(request)
+
+        if not cached_response:
+            # we didn't have a cached response
+            return response
+
+        # Lets update our headers with the headers from the new request:
+        # http://tools.ietf.org/html/draft-ietf-httpbis-p4-conditional-26#section-4.1
+        #
+        # The server isn't supposed to send headers that would make
+        # the cached body invalid. But... just in case, we'll be sure
+        # to strip out ones we know that might be problmatic due to
+        # typical assumptions.
+        excluded_headers = ["content-length"]
+
+        cached_response.headers.update(
+            {
+                k: v
+                for k, v in response.headers.items()
+                if k.lower() not in excluded_headers
+            }
+        )
+
+        # we want a 200 b/c we have content via the cache
+        cached_response.status = 200
+
+        # update our cache
+        self._cache_set(cache_url, request, cached_response)
+
+        return cached_response
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/filewrapper.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/filewrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..25143902a26b6c335c34a8304ba780ac15fb6704
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/filewrapper.py
@@ -0,0 +1,119 @@
+# SPDX-FileCopyrightText: 2015 Eric Larson
+#
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+import mmap
+from tempfile import NamedTemporaryFile
+from typing import TYPE_CHECKING, Any, Callable
+
+if TYPE_CHECKING:
+    from http.client import HTTPResponse
+
+
+class CallbackFileWrapper:
+    """
+    Small wrapper around a fp object which will tee everything read into a
+    buffer, and when that file is closed it will execute a callback with the
+    contents of that buffer.
+
+    All attributes are proxied to the underlying file object.
+
+    This class uses members with a double underscore (__) leading prefix so as
+    not to accidentally shadow an attribute.
+
+    The data is stored in a temporary file until it is all available.  As long
+    as the temporary files directory is disk-based (sometimes it's a
+    memory-backed-``tmpfs`` on Linux), data will be unloaded to disk if memory
+    pressure is high.  For small files the disk usually won't be used at all,
+    it'll all be in the filesystem memory cache, so there should be no
+    performance impact.
+    """
+
+    def __init__(
+        self, fp: HTTPResponse, callback: Callable[[bytes], None] | None
+    ) -> None:
+        self.__buf = NamedTemporaryFile("rb+", delete=True)
+        self.__fp = fp
+        self.__callback = callback
+
+    def __getattr__(self, name: str) -> Any:
+        # The vaguaries of garbage collection means that self.__fp is
+        # not always set.  By using __getattribute__ and the private
+        # name[0] allows looking up the attribute value and raising an
+        # AttributeError when it doesn't exist. This stop thigns from
+        # infinitely recursing calls to getattr in the case where
+        # self.__fp hasn't been set.
+        #
+        # [0] https://docs.python.org/2/reference/expressions.html#atom-identifiers
+        fp = self.__getattribute__("_CallbackFileWrapper__fp")
+        return getattr(fp, name)
+
+    def __is_fp_closed(self) -> bool:
+        try:
+            return self.__fp.fp is None
+
+        except AttributeError:
+            pass
+
+        try:
+            closed: bool = self.__fp.closed
+            return closed
+
+        except AttributeError:
+            pass
+
+        # We just don't cache it then.
+        # TODO: Add some logging here...
+        return False
+
+    def _close(self) -> None:
+        if self.__callback:
+            if self.__buf.tell() == 0:
+                # Empty file:
+                result = b""
+            else:
+                # Return the data without actually loading it into memory,
+                # relying on Python's buffer API and mmap(). mmap() just gives
+                # a view directly into the filesystem's memory cache, so it
+                # doesn't result in duplicate memory use.
+                self.__buf.seek(0, 0)
+                result = memoryview(
+                    mmap.mmap(self.__buf.fileno(), 0, access=mmap.ACCESS_READ)
+                )
+            self.__callback(result)
+
+        # We assign this to None here, because otherwise we can get into
+        # really tricky problems where the CPython interpreter dead locks
+        # because the callback is holding a reference to something which
+        # has a __del__ method. Setting this to None breaks the cycle
+        # and allows the garbage collector to do it's thing normally.
+        self.__callback = None
+
+        # Closing the temporary file releases memory and frees disk space.
+        # Important when caching big files.
+        self.__buf.close()
+
+    def read(self, amt: int | None = None) -> bytes:
+        data: bytes = self.__fp.read(amt)
+        if data:
+            # We may be dealing with b'', a sign that things are over:
+            # it's passed e.g. after we've already closed self.__buf.
+            self.__buf.write(data)
+        if self.__is_fp_closed():
+            self._close()
+
+        return data
+
+    def _safe_read(self, amt: int) -> bytes:
+        data: bytes = self.__fp._safe_read(amt)  # type: ignore[attr-defined]
+        if amt == 2 and data == b"\r\n":
+            # urllib executes this read to toss the CRLF at the end
+            # of the chunk.
+            return data
+
+        self.__buf.write(data)
+        if self.__is_fp_closed():
+            self._close()
+
+        return data
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/py.typed b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/py.typed
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/wrapper.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..f618bc363f12fb416b048e2a86cbddb6082874d6
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/cachecontrol/wrapper.py
@@ -0,0 +1,43 @@
+# SPDX-FileCopyrightText: 2015 Eric Larson
+#
+# SPDX-License-Identifier: Apache-2.0
+from __future__ import annotations
+
+from typing import TYPE_CHECKING, Collection
+
+from pip._vendor.cachecontrol.adapter import CacheControlAdapter
+from pip._vendor.cachecontrol.cache import DictCache
+
+if TYPE_CHECKING:
+    from pip._vendor import requests
+
+    from pip._vendor.cachecontrol.cache import BaseCache
+    from pip._vendor.cachecontrol.controller import CacheController
+    from pip._vendor.cachecontrol.heuristics import BaseHeuristic
+    from pip._vendor.cachecontrol.serialize import Serializer
+
+
+def CacheControl(
+    sess: requests.Session,
+    cache: BaseCache | None = None,
+    cache_etags: bool = True,
+    serializer: Serializer | None = None,
+    heuristic: BaseHeuristic | None = None,
+    controller_class: type[CacheController] | None = None,
+    adapter_class: type[CacheControlAdapter] | None = None,
+    cacheable_methods: Collection[str] | None = None,
+) -> requests.Session:
+    cache = DictCache() if cache is None else cache
+    adapter_class = adapter_class or CacheControlAdapter
+    adapter = adapter_class(
+        cache,
+        cache_etags=cache_etags,
+        serializer=serializer,
+        heuristic=heuristic,
+        controller_class=controller_class,
+        cacheable_methods=cacheable_methods,
+    )
+    sess.mount("http://", adapter)
+    sess.mount("https://", adapter)
+
+    return sess
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6aaa6e23166c7a5d57d8fde7485e8c84777f4eec
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/lexer.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/lexer.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..874dda964ece6caf02586ed3e9a7a14f1fd7564d
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/pygments/__pycache__/lexer.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__init__.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d92acc7bedfc5c7c05130986a256e610640582e5
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__init__.py
@@ -0,0 +1,26 @@
+__all__ = [
+    "__version__",
+    "AbstractProvider",
+    "AbstractResolver",
+    "BaseReporter",
+    "InconsistentCandidate",
+    "Resolver",
+    "RequirementsConflicted",
+    "ResolutionError",
+    "ResolutionImpossible",
+    "ResolutionTooDeep",
+]
+
+__version__ = "1.0.1"
+
+
+from .providers import AbstractProvider, AbstractResolver
+from .reporters import BaseReporter
+from .resolvers import (
+    InconsistentCandidate,
+    RequirementsConflicted,
+    ResolutionError,
+    ResolutionImpossible,
+    ResolutionTooDeep,
+    Resolver,
+)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/__init__.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd71034ff325c52eecd0c5d11e805174bb7a04a7
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/resolvers.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/resolvers.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aacc1e18664aaf0169a8dbc9dd62ee82871869f7
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/resolvers.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/structs.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/structs.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e5959becda620237033f79edc05a79c29c5be11a
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/__pycache__/structs.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/reporters.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/reporters.py
new file mode 100644
index 0000000000000000000000000000000000000000..688b5e10d8608fdb324c5df0ec3d9f4aa720de0e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/resolvelib/reporters.py
@@ -0,0 +1,43 @@
+class BaseReporter(object):
+    """Delegate class to provider progress reporting for the resolver."""
+
+    def starting(self):
+        """Called before the resolution actually starts."""
+
+    def starting_round(self, index):
+        """Called before each round of resolution starts.
+
+        The index is zero-based.
+        """
+
+    def ending_round(self, index, state):
+        """Called before each round of resolution ends.
+
+        This is NOT called if the resolution ends at this round. Use `ending`
+        if you want to report finalization. The index is zero-based.
+        """
+
+    def ending(self, state):
+        """Called before the resolution ends successfully."""
+
+    def adding_requirement(self, requirement, parent):
+        """Called when adding a new requirement into the resolve criteria.
+
+        :param requirement: The additional requirement to be applied to filter
+            the available candidaites.
+        :param parent: The candidate that requires ``requirement`` as a
+            dependency, or None if ``requirement`` is one of the root
+            requirements passed in from ``Resolver.resolve()``.
+        """
+
+    def resolving_conflicts(self, causes):
+        """Called when starting to attempt requirement conflict resolution.
+
+        :param causes: The information on the collision that caused the backtracking.
+        """
+
+    def rejecting_candidate(self, criterion, candidate):
+        """Called when rejecting a candidate during backtracking."""
+
+    def pinning(self, candidate):
+        """Called when adding a candidate to the potential solution."""
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_macos.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_macos.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8afaadc45b70f43f28acdf1e6dee6aef3c402ede
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_macos.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_openssl.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_openssl.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89ca4ff6dd3d4720dde22e9532f297659f3a4ddb
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_openssl.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_ssl_constants.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_ssl_constants.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eedee3c7b1956e04dfd95ebeb25f12c7d2271926
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/__pycache__/_ssl_constants.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/_macos.py b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/_macos.py
new file mode 100644
index 0000000000000000000000000000000000000000..345030772441102eda1be73661469a5f726d668a
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pip/_vendor/truststore/_macos.py
@@ -0,0 +1,571 @@
+import contextlib
+import ctypes
+import platform
+import ssl
+import typing
+from ctypes import (
+    CDLL,
+    POINTER,
+    c_bool,
+    c_char_p,
+    c_int32,
+    c_long,
+    c_uint32,
+    c_ulong,
+    c_void_p,
+)
+from ctypes.util import find_library
+
+from ._ssl_constants import _set_ssl_context_verify_mode
+
+_mac_version = platform.mac_ver()[0]
+_mac_version_info = tuple(map(int, _mac_version.split(".")))
+if _mac_version_info < (10, 8):
+    raise ImportError(
+        f"Only OS X 10.8 and newer are supported, not {_mac_version_info[0]}.{_mac_version_info[1]}"
+    )
+
+_is_macos_version_10_14_or_later = _mac_version_info >= (10, 14)
+
+
+def _load_cdll(name: str, macos10_16_path: str) -> CDLL:
+    """Loads a CDLL by name, falling back to known path on 10.16+"""
+    try:
+        # Big Sur is technically 11 but we use 10.16 due to the Big Sur
+        # beta being labeled as 10.16.
+        path: str | None
+        if _mac_version_info >= (10, 16):
+            path = macos10_16_path
+        else:
+            path = find_library(name)
+        if not path:
+            raise OSError  # Caught and reraised as 'ImportError'
+        return CDLL(path, use_errno=True)
+    except OSError:
+        raise ImportError(f"The library {name} failed to load") from None
+
+
+Security = _load_cdll(
+    "Security", "/System/Library/Frameworks/Security.framework/Security"
+)
+CoreFoundation = _load_cdll(
+    "CoreFoundation",
+    "/System/Library/Frameworks/CoreFoundation.framework/CoreFoundation",
+)
+
+Boolean = c_bool
+CFIndex = c_long
+CFStringEncoding = c_uint32
+CFData = c_void_p
+CFString = c_void_p
+CFArray = c_void_p
+CFMutableArray = c_void_p
+CFError = c_void_p
+CFType = c_void_p
+CFTypeID = c_ulong
+CFTypeRef = POINTER(CFType)
+CFAllocatorRef = c_void_p
+
+OSStatus = c_int32
+
+CFErrorRef = POINTER(CFError)
+CFDataRef = POINTER(CFData)
+CFStringRef = POINTER(CFString)
+CFArrayRef = POINTER(CFArray)
+CFMutableArrayRef = POINTER(CFMutableArray)
+CFArrayCallBacks = c_void_p
+CFOptionFlags = c_uint32
+
+SecCertificateRef = POINTER(c_void_p)
+SecPolicyRef = POINTER(c_void_p)
+SecTrustRef = POINTER(c_void_p)
+SecTrustResultType = c_uint32
+SecTrustOptionFlags = c_uint32
+
+try:
+    Security.SecCertificateCreateWithData.argtypes = [CFAllocatorRef, CFDataRef]
+    Security.SecCertificateCreateWithData.restype = SecCertificateRef
+
+    Security.SecCertificateCopyData.argtypes = [SecCertificateRef]
+    Security.SecCertificateCopyData.restype = CFDataRef
+
+    Security.SecCopyErrorMessageString.argtypes = [OSStatus, c_void_p]
+    Security.SecCopyErrorMessageString.restype = CFStringRef
+
+    Security.SecTrustSetAnchorCertificates.argtypes = [SecTrustRef, CFArrayRef]
+    Security.SecTrustSetAnchorCertificates.restype = OSStatus
+
+    Security.SecTrustSetAnchorCertificatesOnly.argtypes = [SecTrustRef, Boolean]
+    Security.SecTrustSetAnchorCertificatesOnly.restype = OSStatus
+
+    Security.SecPolicyCreateRevocation.argtypes = [CFOptionFlags]
+    Security.SecPolicyCreateRevocation.restype = SecPolicyRef
+
+    Security.SecPolicyCreateSSL.argtypes = [Boolean, CFStringRef]
+    Security.SecPolicyCreateSSL.restype = SecPolicyRef
+
+    Security.SecTrustCreateWithCertificates.argtypes = [
+        CFTypeRef,
+        CFTypeRef,
+        POINTER(SecTrustRef),
+    ]
+    Security.SecTrustCreateWithCertificates.restype = OSStatus
+
+    Security.SecTrustGetTrustResult.argtypes = [
+        SecTrustRef,
+        POINTER(SecTrustResultType),
+    ]
+    Security.SecTrustGetTrustResult.restype = OSStatus
+
+    Security.SecTrustEvaluate.argtypes = [
+        SecTrustRef,
+        POINTER(SecTrustResultType),
+    ]
+    Security.SecTrustEvaluate.restype = OSStatus
+
+    Security.SecTrustRef = SecTrustRef  # type: ignore[attr-defined]
+    Security.SecTrustResultType = SecTrustResultType  # type: ignore[attr-defined]
+    Security.OSStatus = OSStatus  # type: ignore[attr-defined]
+
+    kSecRevocationUseAnyAvailableMethod = 3
+    kSecRevocationRequirePositiveResponse = 8
+
+    CoreFoundation.CFRelease.argtypes = [CFTypeRef]
+    CoreFoundation.CFRelease.restype = None
+
+    CoreFoundation.CFGetTypeID.argtypes = [CFTypeRef]
+    CoreFoundation.CFGetTypeID.restype = CFTypeID
+
+    CoreFoundation.CFStringCreateWithCString.argtypes = [
+        CFAllocatorRef,
+        c_char_p,
+        CFStringEncoding,
+    ]
+    CoreFoundation.CFStringCreateWithCString.restype = CFStringRef
+
+    CoreFoundation.CFStringGetCStringPtr.argtypes = [CFStringRef, CFStringEncoding]
+    CoreFoundation.CFStringGetCStringPtr.restype = c_char_p
+
+    CoreFoundation.CFStringGetCString.argtypes = [
+        CFStringRef,
+        c_char_p,
+        CFIndex,
+        CFStringEncoding,
+    ]
+    CoreFoundation.CFStringGetCString.restype = c_bool
+
+    CoreFoundation.CFDataCreate.argtypes = [CFAllocatorRef, c_char_p, CFIndex]
+    CoreFoundation.CFDataCreate.restype = CFDataRef
+
+    CoreFoundation.CFDataGetLength.argtypes = [CFDataRef]
+    CoreFoundation.CFDataGetLength.restype = CFIndex
+
+    CoreFoundation.CFDataGetBytePtr.argtypes = [CFDataRef]
+    CoreFoundation.CFDataGetBytePtr.restype = c_void_p
+
+    CoreFoundation.CFArrayCreate.argtypes = [
+        CFAllocatorRef,
+        POINTER(CFTypeRef),
+        CFIndex,
+        CFArrayCallBacks,
+    ]
+    CoreFoundation.CFArrayCreate.restype = CFArrayRef
+
+    CoreFoundation.CFArrayCreateMutable.argtypes = [
+        CFAllocatorRef,
+        CFIndex,
+        CFArrayCallBacks,
+    ]
+    CoreFoundation.CFArrayCreateMutable.restype = CFMutableArrayRef
+
+    CoreFoundation.CFArrayAppendValue.argtypes = [CFMutableArrayRef, c_void_p]
+    CoreFoundation.CFArrayAppendValue.restype = None
+
+    CoreFoundation.CFArrayGetCount.argtypes = [CFArrayRef]
+    CoreFoundation.CFArrayGetCount.restype = CFIndex
+
+    CoreFoundation.CFArrayGetValueAtIndex.argtypes = [CFArrayRef, CFIndex]
+    CoreFoundation.CFArrayGetValueAtIndex.restype = c_void_p
+
+    CoreFoundation.CFErrorGetCode.argtypes = [CFErrorRef]
+    CoreFoundation.CFErrorGetCode.restype = CFIndex
+
+    CoreFoundation.CFErrorCopyDescription.argtypes = [CFErrorRef]
+    CoreFoundation.CFErrorCopyDescription.restype = CFStringRef
+
+    CoreFoundation.kCFAllocatorDefault = CFAllocatorRef.in_dll(  # type: ignore[attr-defined]
+        CoreFoundation, "kCFAllocatorDefault"
+    )
+    CoreFoundation.kCFTypeArrayCallBacks = c_void_p.in_dll(  # type: ignore[attr-defined]
+        CoreFoundation, "kCFTypeArrayCallBacks"
+    )
+
+    CoreFoundation.CFTypeRef = CFTypeRef  # type: ignore[attr-defined]
+    CoreFoundation.CFArrayRef = CFArrayRef  # type: ignore[attr-defined]
+    CoreFoundation.CFStringRef = CFStringRef  # type: ignore[attr-defined]
+    CoreFoundation.CFErrorRef = CFErrorRef  # type: ignore[attr-defined]
+
+except AttributeError as e:
+    raise ImportError(f"Error initializing ctypes: {e}") from None
+
+# SecTrustEvaluateWithError is macOS 10.14+
+if _is_macos_version_10_14_or_later:
+    try:
+        Security.SecTrustEvaluateWithError.argtypes = [
+            SecTrustRef,
+            POINTER(CFErrorRef),
+        ]
+        Security.SecTrustEvaluateWithError.restype = c_bool
+    except AttributeError as e:
+        raise ImportError(f"Error initializing ctypes: {e}") from None
+
+
+def _handle_osstatus(result: OSStatus, _: typing.Any, args: typing.Any) -> typing.Any:
+    """
+    Raises an error if the OSStatus value is non-zero.
+    """
+    if int(result) == 0:
+        return args
+
+    # Returns a CFString which we need to transform
+    # into a UTF-8 Python string.
+    error_message_cfstring = None
+    try:
+        error_message_cfstring = Security.SecCopyErrorMessageString(result, None)
+
+        # First step is convert the CFString into a C string pointer.
+        # We try the fast no-copy way first.
+        error_message_cfstring_c_void_p = ctypes.cast(
+            error_message_cfstring, ctypes.POINTER(ctypes.c_void_p)
+        )
+        message = CoreFoundation.CFStringGetCStringPtr(
+            error_message_cfstring_c_void_p, CFConst.kCFStringEncodingUTF8
+        )
+
+        # Quoting the Apple dev docs:
+        #
+        # "A pointer to a C string or NULL if the internal
+        # storage of theString does not allow this to be
+        # returned efficiently."
+        #
+        # So we need to get our hands dirty.
+        if message is None:
+            buffer = ctypes.create_string_buffer(1024)
+            result = CoreFoundation.CFStringGetCString(
+                error_message_cfstring_c_void_p,
+                buffer,
+                1024,
+                CFConst.kCFStringEncodingUTF8,
+            )
+            if not result:
+                raise OSError("Error copying C string from CFStringRef")
+            message = buffer.value
+
+    finally:
+        if error_message_cfstring is not None:
+            CoreFoundation.CFRelease(error_message_cfstring)
+
+    # If no message can be found for this status we come
+    # up with a generic one that forwards the status code.
+    if message is None or message == "":
+        message = f"SecureTransport operation returned a non-zero OSStatus: {result}"
+
+    raise ssl.SSLError(message)
+
+
+Security.SecTrustCreateWithCertificates.errcheck = _handle_osstatus  # type: ignore[assignment]
+Security.SecTrustSetAnchorCertificates.errcheck = _handle_osstatus  # type: ignore[assignment]
+Security.SecTrustSetAnchorCertificatesOnly.errcheck = _handle_osstatus  # type: ignore[assignment]
+Security.SecTrustGetTrustResult.errcheck = _handle_osstatus  # type: ignore[assignment]
+Security.SecTrustEvaluate.errcheck = _handle_osstatus  # type: ignore[assignment]
+
+
+class CFConst:
+    """CoreFoundation constants"""
+
+    kCFStringEncodingUTF8 = CFStringEncoding(0x08000100)
+
+    errSecIncompleteCertRevocationCheck = -67635
+    errSecHostNameMismatch = -67602
+    errSecCertificateExpired = -67818
+    errSecNotTrusted = -67843
+
+
+def _bytes_to_cf_data_ref(value: bytes) -> CFDataRef:  # type: ignore[valid-type]
+    return CoreFoundation.CFDataCreate(  # type: ignore[no-any-return]
+        CoreFoundation.kCFAllocatorDefault, value, len(value)
+    )
+
+
+def _bytes_to_cf_string(value: bytes) -> CFString:
+    """
+    Given a Python binary data, create a CFString.
+    The string must be CFReleased by the caller.
+    """
+    c_str = ctypes.c_char_p(value)
+    cf_str = CoreFoundation.CFStringCreateWithCString(
+        CoreFoundation.kCFAllocatorDefault,
+        c_str,
+        CFConst.kCFStringEncodingUTF8,
+    )
+    return cf_str  # type: ignore[no-any-return]
+
+
+def _cf_string_ref_to_str(cf_string_ref: CFStringRef) -> str | None:  # type: ignore[valid-type]
+    """
+    Creates a Unicode string from a CFString object. Used entirely for error
+    reporting.
+    Yes, it annoys me quite a lot that this function is this complex.
+    """
+
+    string = CoreFoundation.CFStringGetCStringPtr(
+        cf_string_ref, CFConst.kCFStringEncodingUTF8
+    )
+    if string is None:
+        buffer = ctypes.create_string_buffer(1024)
+        result = CoreFoundation.CFStringGetCString(
+            cf_string_ref, buffer, 1024, CFConst.kCFStringEncodingUTF8
+        )
+        if not result:
+            raise OSError("Error copying C string from CFStringRef")
+        string = buffer.value
+    if string is not None:
+        string = string.decode("utf-8")
+    return string  # type: ignore[no-any-return]
+
+
+def _der_certs_to_cf_cert_array(certs: list[bytes]) -> CFMutableArrayRef:  # type: ignore[valid-type]
+    """Builds a CFArray of SecCertificateRefs from a list of DER-encoded certificates.
+    Responsibility of the caller to call CoreFoundation.CFRelease on the CFArray.
+    """
+    cf_array = CoreFoundation.CFArrayCreateMutable(
+        CoreFoundation.kCFAllocatorDefault,
+        0,
+        ctypes.byref(CoreFoundation.kCFTypeArrayCallBacks),
+    )
+    if not cf_array:
+        raise MemoryError("Unable to allocate memory!")
+
+    for cert_data in certs:
+        cf_data = None
+        sec_cert_ref = None
+        try:
+            cf_data = _bytes_to_cf_data_ref(cert_data)
+            sec_cert_ref = Security.SecCertificateCreateWithData(
+                CoreFoundation.kCFAllocatorDefault, cf_data
+            )
+            CoreFoundation.CFArrayAppendValue(cf_array, sec_cert_ref)
+        finally:
+            if cf_data:
+                CoreFoundation.CFRelease(cf_data)
+            if sec_cert_ref:
+                CoreFoundation.CFRelease(sec_cert_ref)
+
+    return cf_array  # type: ignore[no-any-return]
+
+
+@contextlib.contextmanager
+def _configure_context(ctx: ssl.SSLContext) -> typing.Iterator[None]:
+    check_hostname = ctx.check_hostname
+    verify_mode = ctx.verify_mode
+    ctx.check_hostname = False
+    _set_ssl_context_verify_mode(ctx, ssl.CERT_NONE)
+    try:
+        yield
+    finally:
+        ctx.check_hostname = check_hostname
+        _set_ssl_context_verify_mode(ctx, verify_mode)
+
+
+def _verify_peercerts_impl(
+    ssl_context: ssl.SSLContext,
+    cert_chain: list[bytes],
+    server_hostname: str | None = None,
+) -> None:
+    certs = None
+    policies = None
+    trust = None
+    try:
+        # Only set a hostname on the policy if we're verifying the hostname
+        # on the leaf certificate.
+        if server_hostname is not None and ssl_context.check_hostname:
+            cf_str_hostname = None
+            try:
+                cf_str_hostname = _bytes_to_cf_string(server_hostname.encode("ascii"))
+                ssl_policy = Security.SecPolicyCreateSSL(True, cf_str_hostname)
+            finally:
+                if cf_str_hostname:
+                    CoreFoundation.CFRelease(cf_str_hostname)
+        else:
+            ssl_policy = Security.SecPolicyCreateSSL(True, None)
+
+        policies = ssl_policy
+        if ssl_context.verify_flags & ssl.VERIFY_CRL_CHECK_CHAIN:
+            # Add explicit policy requiring positive revocation checks
+            policies = CoreFoundation.CFArrayCreateMutable(
+                CoreFoundation.kCFAllocatorDefault,
+                0,
+                ctypes.byref(CoreFoundation.kCFTypeArrayCallBacks),
+            )
+            CoreFoundation.CFArrayAppendValue(policies, ssl_policy)
+            CoreFoundation.CFRelease(ssl_policy)
+            revocation_policy = Security.SecPolicyCreateRevocation(
+                kSecRevocationUseAnyAvailableMethod
+                | kSecRevocationRequirePositiveResponse
+            )
+            CoreFoundation.CFArrayAppendValue(policies, revocation_policy)
+            CoreFoundation.CFRelease(revocation_policy)
+        elif ssl_context.verify_flags & ssl.VERIFY_CRL_CHECK_LEAF:
+            raise NotImplementedError("VERIFY_CRL_CHECK_LEAF not implemented for macOS")
+
+        certs = None
+        try:
+            certs = _der_certs_to_cf_cert_array(cert_chain)
+
+            # Now that we have certificates loaded and a SecPolicy
+            # we can finally create a SecTrust object!
+            trust = Security.SecTrustRef()
+            Security.SecTrustCreateWithCertificates(
+                certs, policies, ctypes.byref(trust)
+            )
+
+        finally:
+            # The certs are now being held by SecTrust so we can
+            # release our handles for the array.
+            if certs:
+                CoreFoundation.CFRelease(certs)
+
+        # If there are additional trust anchors to load we need to transform
+        # the list of DER-encoded certificates into a CFArray.
+        ctx_ca_certs_der: list[bytes] | None = ssl_context.get_ca_certs(
+            binary_form=True
+        )
+        if ctx_ca_certs_der:
+            ctx_ca_certs = None
+            try:
+                ctx_ca_certs = _der_certs_to_cf_cert_array(ctx_ca_certs_der)
+                Security.SecTrustSetAnchorCertificates(trust, ctx_ca_certs)
+            finally:
+                if ctx_ca_certs:
+                    CoreFoundation.CFRelease(ctx_ca_certs)
+
+        # We always want system certificates.
+        Security.SecTrustSetAnchorCertificatesOnly(trust, False)
+
+        # macOS 10.13 and earlier don't support SecTrustEvaluateWithError()
+        # so we use SecTrustEvaluate() which means we need to construct error
+        # messages ourselves.
+        if _is_macos_version_10_14_or_later:
+            _verify_peercerts_impl_macos_10_14(ssl_context, trust)
+        else:
+            _verify_peercerts_impl_macos_10_13(ssl_context, trust)
+    finally:
+        if policies:
+            CoreFoundation.CFRelease(policies)
+        if trust:
+            CoreFoundation.CFRelease(trust)
+
+
+def _verify_peercerts_impl_macos_10_13(
+    ssl_context: ssl.SSLContext, sec_trust_ref: typing.Any
+) -> None:
+    """Verify using 'SecTrustEvaluate' API for macOS 10.13 and earlier.
+    macOS 10.14 added the 'SecTrustEvaluateWithError' API.
+    """
+    sec_trust_result_type = Security.SecTrustResultType()
+    Security.SecTrustEvaluate(sec_trust_ref, ctypes.byref(sec_trust_result_type))
+
+    try:
+        sec_trust_result_type_as_int = int(sec_trust_result_type.value)
+    except (ValueError, TypeError):
+        sec_trust_result_type_as_int = -1
+
+    # Apple doesn't document these values in their own API docs.
+    # See: https://github.com/xybp888/iOS-SDKs/blob/master/iPhoneOS13.0.sdk/System/Library/Frameworks/Security.framework/Headers/SecTrust.h#L84
+    if (
+        ssl_context.verify_mode == ssl.CERT_REQUIRED
+        and sec_trust_result_type_as_int not in (1, 4)
+    ):
+        # Note that we're not able to ignore only hostname errors
+        # for macOS 10.13 and earlier, so check_hostname=False will
+        # still return an error.
+        sec_trust_result_type_to_message = {
+            0: "Invalid trust result type",
+            # 1: "Trust evaluation succeeded",
+            2: "User confirmation required",
+            3: "User specified that certificate is not trusted",
+            # 4: "Trust result is unspecified",
+            5: "Recoverable trust failure occurred",
+            6: "Fatal trust failure occurred",
+            7: "Other error occurred, certificate may be revoked",
+        }
+        error_message = sec_trust_result_type_to_message.get(
+            sec_trust_result_type_as_int,
+            f"Unknown trust result: {sec_trust_result_type_as_int}",
+        )
+
+        err = ssl.SSLCertVerificationError(error_message)
+        err.verify_message = error_message
+        err.verify_code = sec_trust_result_type_as_int
+        raise err
+
+
+def _verify_peercerts_impl_macos_10_14(
+    ssl_context: ssl.SSLContext, sec_trust_ref: typing.Any
+) -> None:
+    """Verify using 'SecTrustEvaluateWithError' API for macOS 10.14+."""
+    cf_error = CoreFoundation.CFErrorRef()
+    sec_trust_eval_result = Security.SecTrustEvaluateWithError(
+        sec_trust_ref, ctypes.byref(cf_error)
+    )
+    # sec_trust_eval_result is a bool (0 or 1)
+    # where 1 means that the certs are trusted.
+    if sec_trust_eval_result == 1:
+        is_trusted = True
+    elif sec_trust_eval_result == 0:
+        is_trusted = False
+    else:
+        raise ssl.SSLError(
+            f"Unknown result from Security.SecTrustEvaluateWithError: {sec_trust_eval_result!r}"
+        )
+
+    cf_error_code = 0
+    if not is_trusted:
+        cf_error_code = CoreFoundation.CFErrorGetCode(cf_error)
+
+        # If the error is a known failure that we're
+        # explicitly okay with from SSLContext configuration
+        # we can set is_trusted accordingly.
+        if ssl_context.verify_mode != ssl.CERT_REQUIRED and (
+            cf_error_code == CFConst.errSecNotTrusted
+            or cf_error_code == CFConst.errSecCertificateExpired
+        ):
+            is_trusted = True
+
+    # If we're still not trusted then we start to
+    # construct and raise the SSLCertVerificationError.
+    if not is_trusted:
+        cf_error_string_ref = None
+        try:
+            cf_error_string_ref = CoreFoundation.CFErrorCopyDescription(cf_error)
+
+            # Can this ever return 'None' if there's a CFError?
+            cf_error_message = (
+                _cf_string_ref_to_str(cf_error_string_ref)
+                or "Certificate verification failed"
+            )
+
+            # TODO: Not sure if we need the SecTrustResultType for anything?
+            # We only care whether or not it's a success or failure for now.
+            sec_trust_result_type = Security.SecTrustResultType()
+            Security.SecTrustGetTrustResult(
+                sec_trust_ref, ctypes.byref(sec_trust_result_type)
+            )
+
+            err = ssl.SSLCertVerificationError(cf_error_message)
+            err.verify_message = cf_error_message
+            err.verify_code = cf_error_code
+            raise err
+        finally:
+            if cf_error_string_ref:
+                CoreFoundation.CFRelease(cf_error_string_ref)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/__pycache__/commands.cpython-311.pyc b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/__pycache__/commands.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0919af6a4496fe653b4c0663ad1756f3d8b6ff8c
Binary files /dev/null and b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/__pycache__/commands.cpython-311.pyc differ
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/class.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/class.h
new file mode 100644
index 0000000000000000000000000000000000000000..b990507d629b4260d66d51e23a7f34a0fa465c9e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/class.h
@@ -0,0 +1,767 @@
+/*
+    pybind11/detail/class.h: Python C API implementation details for py::class_
+
+    Copyright (c) 2017 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include <pybind11/attr.h>
+#include <pybind11/options.h>
+
+#include "exception_translation.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+#if !defined(PYPY_VERSION)
+#    define PYBIND11_BUILTIN_QUALNAME
+#    define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj)
+#else
+// In PyPy, we still set __qualname__ so that we can produce reliable function type
+// signatures; in CPython this macro expands to nothing:
+#    define PYBIND11_SET_OLDPY_QUALNAME(obj, nameobj)                                             \
+        setattr((PyObject *) obj, "__qualname__", nameobj)
+#endif
+
+inline std::string get_fully_qualified_tp_name(PyTypeObject *type) {
+#if !defined(PYPY_VERSION)
+    return type->tp_name;
+#else
+    auto module_name = handle((PyObject *) type).attr("__module__").cast<std::string>();
+    if (module_name == PYBIND11_BUILTINS_MODULE)
+        return type->tp_name;
+    else
+        return std::move(module_name) + "." + type->tp_name;
+#endif
+}
+
+inline PyTypeObject *type_incref(PyTypeObject *type) {
+    Py_INCREF(type);
+    return type;
+}
+
+#if !defined(PYPY_VERSION)
+
+/// `pybind11_static_property.__get__()`: Always pass the class instead of the instance.
+extern "C" inline PyObject *pybind11_static_get(PyObject *self, PyObject * /*ob*/, PyObject *cls) {
+    return PyProperty_Type.tp_descr_get(self, cls, cls);
+}
+
+/// `pybind11_static_property.__set__()`: Just like the above `__get__()`.
+extern "C" inline int pybind11_static_set(PyObject *self, PyObject *obj, PyObject *value) {
+    PyObject *cls = PyType_Check(obj) ? obj : (PyObject *) Py_TYPE(obj);
+    return PyProperty_Type.tp_descr_set(self, cls, value);
+}
+
+// Forward declaration to use in `make_static_property_type()`
+inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type);
+
+/** A `static_property` is the same as a `property` but the `__get__()` and `__set__()`
+    methods are modified to always use the object type instead of a concrete instance.
+    Return value: New reference. */
+inline PyTypeObject *make_static_property_type() {
+    constexpr auto *name = "pybind11_static_property";
+    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto *heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0);
+    if (!heap_type) {
+        pybind11_fail("make_static_property_type(): error allocating type!");
+    }
+
+    heap_type->ht_name = name_obj.inc_ref().ptr();
+#    ifdef PYBIND11_BUILTIN_QUALNAME
+    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+#    endif
+
+    auto *type = &heap_type->ht_type;
+    type->tp_name = name;
+    type->tp_base = type_incref(&PyProperty_Type);
+    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+    type->tp_descr_get = pybind11_static_get;
+    type->tp_descr_set = pybind11_static_set;
+
+#    if PY_VERSION_HEX >= 0x030C0000
+    // Since Python-3.12 property-derived types are required to
+    // have dynamic attributes (to set `__doc__`)
+    enable_dynamic_attributes(heap_type);
+#    endif
+
+    if (PyType_Ready(type) < 0) {
+        pybind11_fail("make_static_property_type(): failure in PyType_Ready()!");
+    }
+
+    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
+    PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
+
+    return type;
+}
+
+#else // PYPY
+
+/** PyPy has some issues with the above C API, so we evaluate Python code instead.
+    This function will only be called once so performance isn't really a concern.
+    Return value: New reference. */
+inline PyTypeObject *make_static_property_type() {
+    auto d = dict();
+    PyObject *result = PyRun_String(R"(\
+class pybind11_static_property(property):
+    def __get__(self, obj, cls):
+        return property.__get__(self, cls, cls)
+
+    def __set__(self, obj, value):
+        cls = obj if isinstance(obj, type) else type(obj)
+        property.__set__(self, cls, value)
+)",
+                                    Py_file_input,
+                                    d.ptr(),
+                                    d.ptr());
+    if (result == nullptr)
+        throw error_already_set();
+    Py_DECREF(result);
+    return (PyTypeObject *) d["pybind11_static_property"].cast<object>().release().ptr();
+}
+
+#endif // PYPY
+
+/** Types with static properties need to handle `Type.static_prop = x` in a specific way.
+    By default, Python replaces the `static_property` itself, but for wrapped C++ types
+    we need to call `static_property.__set__()` in order to propagate the new value to
+    the underlying C++ data structure. */
+extern "C" inline int pybind11_meta_setattro(PyObject *obj, PyObject *name, PyObject *value) {
+    // Use `_PyType_Lookup()` instead of `PyObject_GetAttr()` in order to get the raw
+    // descriptor (`property`) instead of calling `tp_descr_get` (`property.__get__()`).
+    PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name);
+
+    // The following assignment combinations are possible:
+    //   1. `Type.static_prop = value`             --> descr_set: `Type.static_prop.__set__(value)`
+    //   2. `Type.static_prop = other_static_prop` --> setattro:  replace existing `static_prop`
+    //   3. `Type.regular_attribute = value`       --> setattro:  regular attribute assignment
+    auto *const static_prop = (PyObject *) get_internals().static_property_type;
+    const auto call_descr_set = (descr != nullptr) && (value != nullptr)
+                                && (PyObject_IsInstance(descr, static_prop) != 0)
+                                && (PyObject_IsInstance(value, static_prop) == 0);
+    if (call_descr_set) {
+        // Call `static_property.__set__()` instead of replacing the `static_property`.
+#if !defined(PYPY_VERSION)
+        return Py_TYPE(descr)->tp_descr_set(descr, obj, value);
+#else
+        if (PyObject *result = PyObject_CallMethod(descr, "__set__", "OO", obj, value)) {
+            Py_DECREF(result);
+            return 0;
+        } else {
+            return -1;
+        }
+#endif
+    } else {
+        // Replace existing attribute.
+        return PyType_Type.tp_setattro(obj, name, value);
+    }
+}
+
+/**
+ * Python 3's PyInstanceMethod_Type hides itself via its tp_descr_get, which prevents aliasing
+ * methods via cls.attr("m2") = cls.attr("m1"): instead the tp_descr_get returns a plain function,
+ * when called on a class, or a PyMethod, when called on an instance.  Override that behaviour here
+ * to do a special case bypass for PyInstanceMethod_Types.
+ */
+extern "C" inline PyObject *pybind11_meta_getattro(PyObject *obj, PyObject *name) {
+    PyObject *descr = _PyType_Lookup((PyTypeObject *) obj, name);
+    if (descr && PyInstanceMethod_Check(descr)) {
+        Py_INCREF(descr);
+        return descr;
+    }
+    return PyType_Type.tp_getattro(obj, name);
+}
+
+/// metaclass `__call__` function that is used to create all pybind11 objects.
+extern "C" inline PyObject *pybind11_meta_call(PyObject *type, PyObject *args, PyObject *kwargs) {
+
+    // use the default metaclass call to create/initialize the object
+    PyObject *self = PyType_Type.tp_call(type, args, kwargs);
+    if (self == nullptr) {
+        return nullptr;
+    }
+
+    // Ensure that the base __init__ function(s) were called
+    values_and_holders vhs(self);
+    for (const auto &vh : vhs) {
+        if (!vh.holder_constructed() && !vhs.is_redundant_value_and_holder(vh)) {
+            PyErr_Format(PyExc_TypeError,
+                         "%.200s.__init__() must be called when overriding __init__",
+                         get_fully_qualified_tp_name(vh.type->type).c_str());
+            Py_DECREF(self);
+            return nullptr;
+        }
+    }
+
+    return self;
+}
+
+/// Cleanup the type-info for a pybind11-registered type.
+extern "C" inline void pybind11_meta_dealloc(PyObject *obj) {
+    with_internals([obj](internals &internals) {
+        auto *type = (PyTypeObject *) obj;
+
+        // A pybind11-registered type will:
+        // 1) be found in internals.registered_types_py
+        // 2) have exactly one associated `detail::type_info`
+        auto found_type = internals.registered_types_py.find(type);
+        if (found_type != internals.registered_types_py.end() && found_type->second.size() == 1
+            && found_type->second[0]->type == type) {
+
+            auto *tinfo = found_type->second[0];
+            auto tindex = std::type_index(*tinfo->cpptype);
+            internals.direct_conversions.erase(tindex);
+
+            if (tinfo->module_local) {
+                get_local_internals().registered_types_cpp.erase(tindex);
+            } else {
+                internals.registered_types_cpp.erase(tindex);
+            }
+            internals.registered_types_py.erase(tinfo->type);
+
+            // Actually just `std::erase_if`, but that's only available in C++20
+            auto &cache = internals.inactive_override_cache;
+            for (auto it = cache.begin(), last = cache.end(); it != last;) {
+                if (it->first == (PyObject *) tinfo->type) {
+                    it = cache.erase(it);
+                } else {
+                    ++it;
+                }
+            }
+
+            delete tinfo;
+        }
+    });
+
+    PyType_Type.tp_dealloc(obj);
+}
+
+/** This metaclass is assigned by default to all pybind11 types and is required in order
+    for static properties to function correctly. Users may override this using `py::metaclass`.
+    Return value: New reference. */
+inline PyTypeObject *make_default_metaclass() {
+    constexpr auto *name = "pybind11_type";
+    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto *heap_type = (PyHeapTypeObject *) PyType_Type.tp_alloc(&PyType_Type, 0);
+    if (!heap_type) {
+        pybind11_fail("make_default_metaclass(): error allocating metaclass!");
+    }
+
+    heap_type->ht_name = name_obj.inc_ref().ptr();
+#ifdef PYBIND11_BUILTIN_QUALNAME
+    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+#endif
+
+    auto *type = &heap_type->ht_type;
+    type->tp_name = name;
+    type->tp_base = type_incref(&PyType_Type);
+    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+
+    type->tp_call = pybind11_meta_call;
+
+    type->tp_setattro = pybind11_meta_setattro;
+    type->tp_getattro = pybind11_meta_getattro;
+
+    type->tp_dealloc = pybind11_meta_dealloc;
+
+    if (PyType_Ready(type) < 0) {
+        pybind11_fail("make_default_metaclass(): failure in PyType_Ready()!");
+    }
+
+    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
+    PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
+
+    return type;
+}
+
+/// For multiple inheritance types we need to recursively register/deregister base pointers for any
+/// base classes with pointers that are difference from the instance value pointer so that we can
+/// correctly recognize an offset base class pointer. This calls a function with any offset base
+/// ptrs.
+inline void traverse_offset_bases(void *valueptr,
+                                  const detail::type_info *tinfo,
+                                  instance *self,
+                                  bool (*f)(void * /*parentptr*/, instance * /*self*/)) {
+    for (handle h : reinterpret_borrow<tuple>(tinfo->type->tp_bases)) {
+        if (auto *parent_tinfo = get_type_info((PyTypeObject *) h.ptr())) {
+            for (auto &c : parent_tinfo->implicit_casts) {
+                if (c.first == tinfo->cpptype) {
+                    auto *parentptr = c.second(valueptr);
+                    if (parentptr != valueptr) {
+                        f(parentptr, self);
+                    }
+                    traverse_offset_bases(parentptr, parent_tinfo, self, f);
+                    break;
+                }
+            }
+        }
+    }
+}
+
+inline bool register_instance_impl(void *ptr, instance *self) {
+    with_instance_map(ptr, [&](instance_map &instances) { instances.emplace(ptr, self); });
+    return true; // unused, but gives the same signature as the deregister func
+}
+inline bool deregister_instance_impl(void *ptr, instance *self) {
+    return with_instance_map(ptr, [&](instance_map &instances) {
+        auto range = instances.equal_range(ptr);
+        for (auto it = range.first; it != range.second; ++it) {
+            if (self == it->second) {
+                instances.erase(it);
+                return true;
+            }
+        }
+        return false;
+    });
+}
+
+inline void register_instance(instance *self, void *valptr, const type_info *tinfo) {
+    register_instance_impl(valptr, self);
+    if (!tinfo->simple_ancestors) {
+        traverse_offset_bases(valptr, tinfo, self, register_instance_impl);
+    }
+}
+
+inline bool deregister_instance(instance *self, void *valptr, const type_info *tinfo) {
+    bool ret = deregister_instance_impl(valptr, self);
+    if (!tinfo->simple_ancestors) {
+        traverse_offset_bases(valptr, tinfo, self, deregister_instance_impl);
+    }
+    return ret;
+}
+
+/// Instance creation function for all pybind11 types. It allocates the internal instance layout
+/// for holding C++ objects and holders.  Allocation is done lazily (the first time the instance is
+/// cast to a reference or pointer), and initialization is done by an `__init__` function.
+inline PyObject *make_new_instance(PyTypeObject *type) {
+#if defined(PYPY_VERSION)
+    // PyPy gets tp_basicsize wrong (issue 2482) under multiple inheritance when the first
+    // inherited object is a plain Python type (i.e. not derived from an extension type).  Fix it.
+    ssize_t instance_size = static_cast<ssize_t>(sizeof(instance));
+    if (type->tp_basicsize < instance_size) {
+        type->tp_basicsize = instance_size;
+    }
+#endif
+    PyObject *self = type->tp_alloc(type, 0);
+    auto *inst = reinterpret_cast<instance *>(self);
+    // Allocate the value/holder internals:
+    inst->allocate_layout();
+
+    return self;
+}
+
+/// Instance creation function for all pybind11 types. It only allocates space for the
+/// C++ object, but doesn't call the constructor -- an `__init__` function must do that.
+extern "C" inline PyObject *pybind11_object_new(PyTypeObject *type, PyObject *, PyObject *) {
+    return make_new_instance(type);
+}
+
+/// An `__init__` function constructs the C++ object. Users should provide at least one
+/// of these using `py::init` or directly with `.def(__init__, ...)`. Otherwise, the
+/// following default function will be used which simply throws an exception.
+extern "C" inline int pybind11_object_init(PyObject *self, PyObject *, PyObject *) {
+    PyTypeObject *type = Py_TYPE(self);
+    std::string msg = get_fully_qualified_tp_name(type) + ": No constructor defined!";
+    set_error(PyExc_TypeError, msg.c_str());
+    return -1;
+}
+
+inline void add_patient(PyObject *nurse, PyObject *patient) {
+    auto *instance = reinterpret_cast<detail::instance *>(nurse);
+    instance->has_patients = true;
+    Py_INCREF(patient);
+
+    with_internals([&](internals &internals) { internals.patients[nurse].push_back(patient); });
+}
+
+inline void clear_patients(PyObject *self) {
+    auto *instance = reinterpret_cast<detail::instance *>(self);
+    std::vector<PyObject *> patients;
+
+    with_internals([&](internals &internals) {
+        auto pos = internals.patients.find(self);
+
+        if (pos == internals.patients.end()) {
+            pybind11_fail(
+                "FATAL: Internal consistency check failed: Invalid clear_patients() call.");
+        }
+
+        // Clearing the patients can cause more Python code to run, which
+        // can invalidate the iterator. Extract the vector of patients
+        // from the unordered_map first.
+        patients = std::move(pos->second);
+        internals.patients.erase(pos);
+    });
+
+    instance->has_patients = false;
+    for (PyObject *&patient : patients) {
+        Py_CLEAR(patient);
+    }
+}
+
+/// Clears all internal data from the instance and removes it from registered instances in
+/// preparation for deallocation.
+inline void clear_instance(PyObject *self) {
+    auto *instance = reinterpret_cast<detail::instance *>(self);
+
+    // Deallocate any values/holders, if present:
+    for (auto &v_h : values_and_holders(instance)) {
+        if (v_h) {
+
+            // We have to deregister before we call dealloc because, for virtual MI types, we still
+            // need to be able to get the parent pointers.
+            if (v_h.instance_registered()
+                && !deregister_instance(instance, v_h.value_ptr(), v_h.type)) {
+                pybind11_fail(
+                    "pybind11_object_dealloc(): Tried to deallocate unregistered instance!");
+            }
+
+            if (instance->owned || v_h.holder_constructed()) {
+                v_h.type->dealloc(v_h);
+            }
+        }
+    }
+    // Deallocate the value/holder layout internals:
+    instance->deallocate_layout();
+
+    if (instance->weakrefs) {
+        PyObject_ClearWeakRefs(self);
+    }
+
+    PyObject **dict_ptr = _PyObject_GetDictPtr(self);
+    if (dict_ptr) {
+        Py_CLEAR(*dict_ptr);
+    }
+
+    if (instance->has_patients) {
+        clear_patients(self);
+    }
+}
+
+/// Instance destructor function for all pybind11 types. It calls `type_info.dealloc`
+/// to destroy the C++ object itself, while the rest is Python bookkeeping.
+extern "C" inline void pybind11_object_dealloc(PyObject *self) {
+    auto *type = Py_TYPE(self);
+
+    // If this is a GC tracked object, untrack it first
+    // Note that the track call is implicitly done by the
+    // default tp_alloc, which we never override.
+    if (PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC) != 0) {
+        PyObject_GC_UnTrack(self);
+    }
+
+    clear_instance(self);
+
+    type->tp_free(self);
+
+#if PY_VERSION_HEX < 0x03080000
+    // `type->tp_dealloc != pybind11_object_dealloc` means that we're being called
+    // as part of a derived type's dealloc, in which case we're not allowed to decref
+    // the type here. For cross-module compatibility, we shouldn't compare directly
+    // with `pybind11_object_dealloc`, but with the common one stashed in internals.
+    auto pybind11_object_type = (PyTypeObject *) get_internals().instance_base;
+    if (type->tp_dealloc == pybind11_object_type->tp_dealloc)
+        Py_DECREF(type);
+#else
+    // This was not needed before Python 3.8 (Python issue 35810)
+    // https://github.com/pybind/pybind11/issues/1946
+    Py_DECREF(type);
+#endif
+}
+
+std::string error_string();
+
+/** Create the type which can be used as a common base for all classes.  This is
+    needed in order to satisfy Python's requirements for multiple inheritance.
+    Return value: New reference. */
+inline PyObject *make_object_base_type(PyTypeObject *metaclass) {
+    constexpr auto *name = "pybind11_object";
+    auto name_obj = reinterpret_steal<object>(PYBIND11_FROM_STRING(name));
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto *heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
+    if (!heap_type) {
+        pybind11_fail("make_object_base_type(): error allocating type!");
+    }
+
+    heap_type->ht_name = name_obj.inc_ref().ptr();
+#ifdef PYBIND11_BUILTIN_QUALNAME
+    heap_type->ht_qualname = name_obj.inc_ref().ptr();
+#endif
+
+    auto *type = &heap_type->ht_type;
+    type->tp_name = name;
+    type->tp_base = type_incref(&PyBaseObject_Type);
+    type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
+    type->tp_flags = Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HEAPTYPE;
+
+    type->tp_new = pybind11_object_new;
+    type->tp_init = pybind11_object_init;
+    type->tp_dealloc = pybind11_object_dealloc;
+
+    /* Support weak references (needed for the keep_alive feature) */
+    type->tp_weaklistoffset = offsetof(instance, weakrefs);
+
+    if (PyType_Ready(type) < 0) {
+        pybind11_fail("PyType_Ready failed in make_object_base_type(): " + error_string());
+    }
+
+    setattr((PyObject *) type, "__module__", str("pybind11_builtins"));
+    PYBIND11_SET_OLDPY_QUALNAME(type, name_obj);
+
+    assert(!PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+    return (PyObject *) heap_type;
+}
+
+/// dynamic_attr: Allow the garbage collector to traverse the internal instance `__dict__`.
+extern "C" inline int pybind11_traverse(PyObject *self, visitproc visit, void *arg) {
+#if PY_VERSION_HEX >= 0x030D0000
+    PyObject_VisitManagedDict(self, visit, arg);
+#else
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_VISIT(dict);
+#endif
+// https://docs.python.org/3/c-api/typeobj.html#c.PyTypeObject.tp_traverse
+#if PY_VERSION_HEX >= 0x03090000
+    Py_VISIT(Py_TYPE(self));
+#endif
+    return 0;
+}
+
+/// dynamic_attr: Allow the GC to clear the dictionary.
+extern "C" inline int pybind11_clear(PyObject *self) {
+#if PY_VERSION_HEX >= 0x030D0000
+    PyObject_ClearManagedDict(self);
+#else
+    PyObject *&dict = *_PyObject_GetDictPtr(self);
+    Py_CLEAR(dict);
+#endif
+    return 0;
+}
+
+/// Give instances of this type a `__dict__` and opt into garbage collection.
+inline void enable_dynamic_attributes(PyHeapTypeObject *heap_type) {
+    auto *type = &heap_type->ht_type;
+    type->tp_flags |= Py_TPFLAGS_HAVE_GC;
+#if PY_VERSION_HEX < 0x030B0000
+    type->tp_dictoffset = type->tp_basicsize;           // place dict at the end
+    type->tp_basicsize += (ssize_t) sizeof(PyObject *); // and allocate enough space for it
+#else
+    type->tp_flags |= Py_TPFLAGS_MANAGED_DICT;
+#endif
+    type->tp_traverse = pybind11_traverse;
+    type->tp_clear = pybind11_clear;
+
+    static PyGetSetDef getset[]
+        = {{"__dict__", PyObject_GenericGetDict, PyObject_GenericSetDict, nullptr, nullptr},
+           {nullptr, nullptr, nullptr, nullptr, nullptr}};
+    type->tp_getset = getset;
+}
+
+/// buffer_protocol: Fill in the view as specified by flags.
+extern "C" inline int pybind11_getbuffer(PyObject *obj, Py_buffer *view, int flags) {
+    // Look for a `get_buffer` implementation in this type's info or any bases (following MRO).
+    type_info *tinfo = nullptr;
+    for (auto type : reinterpret_borrow<tuple>(Py_TYPE(obj)->tp_mro)) {
+        tinfo = get_type_info((PyTypeObject *) type.ptr());
+        if (tinfo && tinfo->get_buffer) {
+            break;
+        }
+    }
+    if (view == nullptr || !tinfo || !tinfo->get_buffer) {
+        if (view) {
+            view->obj = nullptr;
+        }
+        set_error(PyExc_BufferError, "pybind11_getbuffer(): Internal error");
+        return -1;
+    }
+    std::memset(view, 0, sizeof(Py_buffer));
+    buffer_info *info = nullptr;
+    try {
+        info = tinfo->get_buffer(obj, tinfo->get_buffer_data);
+    } catch (...) {
+        try_translate_exceptions();
+        raise_from(PyExc_BufferError, "Error getting buffer");
+        return -1;
+    }
+    if (info == nullptr) {
+        pybind11_fail("FATAL UNEXPECTED SITUATION: tinfo->get_buffer() returned nullptr.");
+    }
+
+    if ((flags & PyBUF_WRITABLE) == PyBUF_WRITABLE && info->readonly) {
+        delete info;
+        // view->obj = nullptr;  // Was just memset to 0, so not necessary
+        set_error(PyExc_BufferError, "Writable buffer requested for readonly storage");
+        return -1;
+    }
+    view->obj = obj;
+    view->ndim = 1;
+    view->internal = info;
+    view->buf = info->ptr;
+    view->itemsize = info->itemsize;
+    view->len = view->itemsize;
+    for (auto s : info->shape) {
+        view->len *= s;
+    }
+    view->readonly = static_cast<int>(info->readonly);
+    if ((flags & PyBUF_FORMAT) == PyBUF_FORMAT) {
+        view->format = const_cast<char *>(info->format.c_str());
+    }
+    if ((flags & PyBUF_STRIDES) == PyBUF_STRIDES) {
+        view->ndim = (int) info->ndim;
+        view->strides = info->strides.data();
+        view->shape = info->shape.data();
+    }
+    Py_INCREF(view->obj);
+    return 0;
+}
+
+/// buffer_protocol: Release the resources of the buffer.
+extern "C" inline void pybind11_releasebuffer(PyObject *, Py_buffer *view) {
+    delete (buffer_info *) view->internal;
+}
+
+/// Give this type a buffer interface.
+inline void enable_buffer_protocol(PyHeapTypeObject *heap_type) {
+    heap_type->ht_type.tp_as_buffer = &heap_type->as_buffer;
+
+    heap_type->as_buffer.bf_getbuffer = pybind11_getbuffer;
+    heap_type->as_buffer.bf_releasebuffer = pybind11_releasebuffer;
+}
+
+/** Create a brand new Python type according to the `type_record` specification.
+    Return value: New reference. */
+inline PyObject *make_new_python_type(const type_record &rec) {
+    auto name = reinterpret_steal<object>(PYBIND11_FROM_STRING(rec.name));
+
+    auto qualname = name;
+    if (rec.scope && !PyModule_Check(rec.scope.ptr()) && hasattr(rec.scope, "__qualname__")) {
+        qualname = reinterpret_steal<object>(
+            PyUnicode_FromFormat("%U.%U", rec.scope.attr("__qualname__").ptr(), name.ptr()));
+    }
+
+    object module_;
+    if (rec.scope) {
+        if (hasattr(rec.scope, "__module__")) {
+            module_ = rec.scope.attr("__module__");
+        } else if (hasattr(rec.scope, "__name__")) {
+            module_ = rec.scope.attr("__name__");
+        }
+    }
+
+    const auto *full_name = c_str(
+#if !defined(PYPY_VERSION)
+        module_ ? str(module_).cast<std::string>() + "." + rec.name :
+#endif
+                rec.name);
+
+    char *tp_doc = nullptr;
+    if (rec.doc && options::show_user_defined_docstrings()) {
+        /* Allocate memory for docstring (Python will free this later on) */
+        size_t size = std::strlen(rec.doc) + 1;
+#if PY_VERSION_HEX >= 0x030D0000
+        tp_doc = (char *) PyMem_MALLOC(size);
+#else
+        tp_doc = (char *) PyObject_MALLOC(size);
+#endif
+        std::memcpy((void *) tp_doc, rec.doc, size);
+    }
+
+    auto &internals = get_internals();
+    auto bases = tuple(rec.bases);
+    auto *base = (bases.empty()) ? internals.instance_base : bases[0].ptr();
+
+    /* Danger zone: from now (and until PyType_Ready), make sure to
+       issue no Python C API calls which could potentially invoke the
+       garbage collector (the GC will call type_traverse(), which will in
+       turn find the newly constructed type in an invalid state) */
+    auto *metaclass
+        = rec.metaclass.ptr() ? (PyTypeObject *) rec.metaclass.ptr() : internals.default_metaclass;
+
+    auto *heap_type = (PyHeapTypeObject *) metaclass->tp_alloc(metaclass, 0);
+    if (!heap_type) {
+        pybind11_fail(std::string(rec.name) + ": Unable to create type object!");
+    }
+
+    heap_type->ht_name = name.release().ptr();
+#ifdef PYBIND11_BUILTIN_QUALNAME
+    heap_type->ht_qualname = qualname.inc_ref().ptr();
+#endif
+
+    auto *type = &heap_type->ht_type;
+    type->tp_name = full_name;
+    type->tp_doc = tp_doc;
+    type->tp_base = type_incref((PyTypeObject *) base);
+    type->tp_basicsize = static_cast<ssize_t>(sizeof(instance));
+    if (!bases.empty()) {
+        type->tp_bases = bases.release().ptr();
+    }
+
+    /* Don't inherit base __init__ */
+    type->tp_init = pybind11_object_init;
+
+    /* Supported protocols */
+    type->tp_as_number = &heap_type->as_number;
+    type->tp_as_sequence = &heap_type->as_sequence;
+    type->tp_as_mapping = &heap_type->as_mapping;
+    type->tp_as_async = &heap_type->as_async;
+
+    /* Flags */
+    type->tp_flags |= Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HEAPTYPE;
+    if (!rec.is_final) {
+        type->tp_flags |= Py_TPFLAGS_BASETYPE;
+    }
+
+    if (rec.dynamic_attr) {
+        enable_dynamic_attributes(heap_type);
+    }
+
+    if (rec.buffer_protocol) {
+        enable_buffer_protocol(heap_type);
+    }
+
+    if (rec.custom_type_setup_callback) {
+        rec.custom_type_setup_callback(heap_type);
+    }
+
+    if (PyType_Ready(type) < 0) {
+        pybind11_fail(std::string(rec.name) + ": PyType_Ready failed: " + error_string());
+    }
+
+    assert(!rec.dynamic_attr || PyType_HasFeature(type, Py_TPFLAGS_HAVE_GC));
+
+    /* Register type with the parent scope */
+    if (rec.scope) {
+        setattr(rec.scope, rec.name, (PyObject *) type);
+    } else {
+        Py_INCREF(type); // Keep it alive forever (reference leak)
+    }
+
+    if (module_) { // Needed by pydoc
+        setattr((PyObject *) type, "__module__", module_);
+    }
+
+    PYBIND11_SET_OLDPY_QUALNAME(type, qualname);
+
+    return (PyObject *) type;
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/descr.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/descr.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d546311e78f76f92af13d745b84c3664228fdc9
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/descr.h
@@ -0,0 +1,172 @@
+/*
+    pybind11/detail/descr.h: Helper type for concatenating type signatures at compile time
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "common.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+#if !defined(_MSC_VER)
+#    define PYBIND11_DESCR_CONSTEXPR static constexpr
+#else
+#    define PYBIND11_DESCR_CONSTEXPR const
+#endif
+
+/* Concatenate type signatures at compile time */
+template <size_t N, typename... Ts>
+struct descr {
+    char text[N + 1]{'\0'};
+
+    constexpr descr() = default;
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    constexpr descr(char const (&s)[N + 1]) : descr(s, make_index_sequence<N>()) {}
+
+    template <size_t... Is>
+    constexpr descr(char const (&s)[N + 1], index_sequence<Is...>) : text{s[Is]..., '\0'} {}
+
+    template <typename... Chars>
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    constexpr descr(char c, Chars... cs) : text{c, static_cast<char>(cs)..., '\0'} {}
+
+    static constexpr std::array<const std::type_info *, sizeof...(Ts) + 1> types() {
+        return {{&typeid(Ts)..., nullptr}};
+    }
+};
+
+template <size_t N1, size_t N2, typename... Ts1, typename... Ts2, size_t... Is1, size_t... Is2>
+constexpr descr<N1 + N2, Ts1..., Ts2...> plus_impl(const descr<N1, Ts1...> &a,
+                                                   const descr<N2, Ts2...> &b,
+                                                   index_sequence<Is1...>,
+                                                   index_sequence<Is2...>) {
+    PYBIND11_WORKAROUND_INCORRECT_MSVC_C4100(b);
+    return {a.text[Is1]..., b.text[Is2]...};
+}
+
+template <size_t N1, size_t N2, typename... Ts1, typename... Ts2>
+constexpr descr<N1 + N2, Ts1..., Ts2...> operator+(const descr<N1, Ts1...> &a,
+                                                   const descr<N2, Ts2...> &b) {
+    return plus_impl(a, b, make_index_sequence<N1>(), make_index_sequence<N2>());
+}
+
+template <size_t N>
+constexpr descr<N - 1> const_name(char const (&text)[N]) {
+    return descr<N - 1>(text);
+}
+constexpr descr<0> const_name(char const (&)[1]) { return {}; }
+
+template <size_t Rem, size_t... Digits>
+struct int_to_str : int_to_str<Rem / 10, Rem % 10, Digits...> {};
+template <size_t... Digits>
+struct int_to_str<0, Digits...> {
+    // WARNING: This only works with C++17 or higher.
+    static constexpr auto digits = descr<sizeof...(Digits)>(('0' + Digits)...);
+};
+
+// Ternary description (like std::conditional)
+template <bool B, size_t N1, size_t N2>
+constexpr enable_if_t<B, descr<N1 - 1>> const_name(char const (&text1)[N1], char const (&)[N2]) {
+    return const_name(text1);
+}
+template <bool B, size_t N1, size_t N2>
+constexpr enable_if_t<!B, descr<N2 - 1>> const_name(char const (&)[N1], char const (&text2)[N2]) {
+    return const_name(text2);
+}
+
+template <bool B, typename T1, typename T2>
+constexpr enable_if_t<B, T1> const_name(const T1 &d, const T2 &) {
+    return d;
+}
+template <bool B, typename T1, typename T2>
+constexpr enable_if_t<!B, T2> const_name(const T1 &, const T2 &d) {
+    return d;
+}
+
+template <size_t Size>
+auto constexpr const_name() -> remove_cv_t<decltype(int_to_str<Size / 10, Size % 10>::digits)> {
+    return int_to_str<Size / 10, Size % 10>::digits;
+}
+
+template <typename Type>
+constexpr descr<1, Type> const_name() {
+    return {'%'};
+}
+
+// If "_" is defined as a macro, py::detail::_ cannot be provided.
+// It is therefore best to use py::detail::const_name universally.
+// This block is for backward compatibility only.
+// (The const_name code is repeated to avoid introducing a "_" #define ourselves.)
+#ifndef _
+#    define PYBIND11_DETAIL_UNDERSCORE_BACKWARD_COMPATIBILITY
+template <size_t N>
+constexpr descr<N - 1> _(char const (&text)[N]) {
+    return const_name<N>(text);
+}
+template <bool B, size_t N1, size_t N2>
+constexpr enable_if_t<B, descr<N1 - 1>> _(char const (&text1)[N1], char const (&text2)[N2]) {
+    return const_name<B, N1, N2>(text1, text2);
+}
+template <bool B, size_t N1, size_t N2>
+constexpr enable_if_t<!B, descr<N2 - 1>> _(char const (&text1)[N1], char const (&text2)[N2]) {
+    return const_name<B, N1, N2>(text1, text2);
+}
+template <bool B, typename T1, typename T2>
+constexpr enable_if_t<B, T1> _(const T1 &d1, const T2 &d2) {
+    return const_name<B, T1, T2>(d1, d2);
+}
+template <bool B, typename T1, typename T2>
+constexpr enable_if_t<!B, T2> _(const T1 &d1, const T2 &d2) {
+    return const_name<B, T1, T2>(d1, d2);
+}
+
+template <size_t Size>
+auto constexpr _() -> remove_cv_t<decltype(int_to_str<Size / 10, Size % 10>::digits)> {
+    return const_name<Size>();
+}
+template <typename Type>
+constexpr descr<1, Type> _() {
+    return const_name<Type>();
+}
+#endif // #ifndef _
+
+constexpr descr<0> concat() { return {}; }
+
+template <size_t N, typename... Ts>
+constexpr descr<N, Ts...> concat(const descr<N, Ts...> &descr) {
+    return descr;
+}
+
+#ifdef __cpp_fold_expressions
+template <size_t N1, size_t N2, typename... Ts1, typename... Ts2>
+constexpr descr<N1 + N2 + 2, Ts1..., Ts2...> operator,(const descr<N1, Ts1...> &a,
+                                                       const descr<N2, Ts2...> &b) {
+    return a + const_name(", ") + b;
+}
+
+template <size_t N, typename... Ts, typename... Args>
+constexpr auto concat(const descr<N, Ts...> &d, const Args &...args) {
+    return (d, ..., args);
+}
+#else
+template <size_t N, typename... Ts, typename... Args>
+constexpr auto concat(const descr<N, Ts...> &d,
+                      const Args &...args) -> decltype(std::declval<descr<N + 2, Ts...>>()
+                                                       + concat(args...)) {
+    return d + const_name(", ") + concat(args...);
+}
+#endif
+
+template <size_t N, typename... Ts>
+constexpr descr<N + 2, Ts...> type_descr(const descr<N, Ts...> &descr) {
+    return const_name("{") + descr + const_name("}");
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/exception_translation.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/exception_translation.h
new file mode 100644
index 0000000000000000000000000000000000000000..2764180bb078c4ed2b9648de9d6752040256d295
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/exception_translation.h
@@ -0,0 +1,71 @@
+/*
+    pybind11/detail/exception_translation.h: means to translate C++ exceptions to Python exceptions
+
+    Copyright (c) 2024 The Pybind Development Team.
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "common.h"
+#include "internals.h"
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+// Apply all the extensions translators from a list
+// Return true if one of the translators completed without raising an exception
+// itself. Return of false indicates that if there are other translators
+// available, they should be tried.
+inline bool apply_exception_translators(std::forward_list<ExceptionTranslator> &translators) {
+    auto last_exception = std::current_exception();
+
+    for (auto &translator : translators) {
+        try {
+            translator(last_exception);
+            return true;
+        } catch (...) {
+            last_exception = std::current_exception();
+        }
+    }
+    return false;
+}
+
+inline void try_translate_exceptions() {
+    /* When an exception is caught, give each registered exception
+        translator a chance to translate it to a Python exception. First
+        all module-local translators will be tried in reverse order of
+        registration. If none of the module-locale translators handle
+        the exception (or there are no module-locale translators) then
+        the global translators will be tried, also in reverse order of
+        registration.
+
+        A translator may choose to do one of the following:
+
+        - catch the exception and call py::set_error()
+            to set a standard (or custom) Python exception, or
+        - do nothing and let the exception fall through to the next translator, or
+        - delegate translation to the next translator by throwing a new type of exception.
+        */
+
+    bool handled = with_internals([&](internals &internals) {
+        auto &local_exception_translators = get_local_internals().registered_exception_translators;
+        if (detail::apply_exception_translators(local_exception_translators)) {
+            return true;
+        }
+        auto &exception_translators = internals.registered_exception_translators;
+        if (detail::apply_exception_translators(exception_translators)) {
+            return true;
+        }
+        return false;
+    });
+
+    if (!handled) {
+        set_error(PyExc_SystemError, "Exception escaped from default exception translator!");
+    }
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/type_caster_base.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/type_caster_base.h
new file mode 100644
index 0000000000000000000000000000000000000000..e40e44ba6cc9a92d0e5a697a2478477f78320386
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/detail/type_caster_base.h
@@ -0,0 +1,1195 @@
+/*
+    pybind11/detail/type_caster_base.h (originally first part of pybind11/cast.h)
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include <pybind11/pytypes.h>
+
+#include "common.h"
+#include "cpp_conduit.h"
+#include "descr.h"
+#include "internals.h"
+#include "typeid.h"
+#include "value_and_holder.h"
+
+#include <cstdint>
+#include <cstring>
+#include <iterator>
+#include <new>
+#include <stdexcept>
+#include <string>
+#include <type_traits>
+#include <typeindex>
+#include <typeinfo>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+/// A life support system for temporary objects created by `type_caster::load()`.
+/// Adding a patient will keep it alive up until the enclosing function returns.
+class loader_life_support {
+private:
+    loader_life_support *parent = nullptr;
+    std::unordered_set<PyObject *> keep_alive;
+
+    // Store stack pointer in thread-local storage.
+    static PYBIND11_TLS_KEY_REF get_stack_tls_key() {
+#if PYBIND11_INTERNALS_VERSION == 4
+        return get_local_internals().loader_life_support_tls_key;
+#else
+        return get_internals().loader_life_support_tls_key;
+#endif
+    }
+    static loader_life_support *get_stack_top() {
+        return static_cast<loader_life_support *>(PYBIND11_TLS_GET_VALUE(get_stack_tls_key()));
+    }
+    static void set_stack_top(loader_life_support *value) {
+        PYBIND11_TLS_REPLACE_VALUE(get_stack_tls_key(), value);
+    }
+
+public:
+    /// A new patient frame is created when a function is entered
+    loader_life_support() : parent{get_stack_top()} { set_stack_top(this); }
+
+    /// ... and destroyed after it returns
+    ~loader_life_support() {
+        if (get_stack_top() != this) {
+            pybind11_fail("loader_life_support: internal error");
+        }
+        set_stack_top(parent);
+        for (auto *item : keep_alive) {
+            Py_DECREF(item);
+        }
+    }
+
+    /// This can only be used inside a pybind11-bound function, either by `argument_loader`
+    /// at argument preparation time or by `py::cast()` at execution time.
+    PYBIND11_NOINLINE static void add_patient(handle h) {
+        loader_life_support *frame = get_stack_top();
+        if (!frame) {
+            // NOTE: It would be nice to include the stack frames here, as this indicates
+            // use of pybind11::cast<> outside the normal call framework, finding such
+            // a location is challenging. Developers could consider printing out
+            // stack frame addresses here using something like __builtin_frame_address(0)
+            throw cast_error("When called outside a bound function, py::cast() cannot "
+                             "do Python -> C++ conversions which require the creation "
+                             "of temporary values");
+        }
+
+        if (frame->keep_alive.insert(h.ptr()).second) {
+            Py_INCREF(h.ptr());
+        }
+    }
+};
+
+// Gets the cache entry for the given type, creating it if necessary.  The return value is the pair
+// returned by emplace, i.e. an iterator for the entry and a bool set to `true` if the entry was
+// just created.
+inline std::pair<decltype(internals::registered_types_py)::iterator, bool>
+all_type_info_get_cache(PyTypeObject *type);
+
+// Band-aid workaround to fix a subtle but serious bug in a minimalistic fashion. See PR #4762.
+inline void all_type_info_add_base_most_derived_first(std::vector<type_info *> &bases,
+                                                      type_info *addl_base) {
+    for (auto it = bases.begin(); it != bases.end(); it++) {
+        type_info *existing_base = *it;
+        if (PyType_IsSubtype(addl_base->type, existing_base->type) != 0) {
+            bases.insert(it, addl_base);
+            return;
+        }
+    }
+    bases.push_back(addl_base);
+}
+
+// Populates a just-created cache entry.
+PYBIND11_NOINLINE void all_type_info_populate(PyTypeObject *t, std::vector<type_info *> &bases) {
+    assert(bases.empty());
+    std::vector<PyTypeObject *> check;
+    for (handle parent : reinterpret_borrow<tuple>(t->tp_bases)) {
+        check.push_back((PyTypeObject *) parent.ptr());
+    }
+
+    auto const &type_dict = get_internals().registered_types_py;
+    for (size_t i = 0; i < check.size(); i++) {
+        auto *type = check[i];
+        // Ignore Python2 old-style class super type:
+        if (!PyType_Check((PyObject *) type)) {
+            continue;
+        }
+
+        // Check `type` in the current set of registered python types:
+        auto it = type_dict.find(type);
+        if (it != type_dict.end()) {
+            // We found a cache entry for it, so it's either pybind-registered or has pre-computed
+            // pybind bases, but we have to make sure we haven't already seen the type(s) before:
+            // we want to follow Python/virtual C++ rules that there should only be one instance of
+            // a common base.
+            for (auto *tinfo : it->second) {
+                // NB: Could use a second set here, rather than doing a linear search, but since
+                // having a large number of immediate pybind11-registered types seems fairly
+                // unlikely, that probably isn't worthwhile.
+                bool found = false;
+                for (auto *known : bases) {
+                    if (known == tinfo) {
+                        found = true;
+                        break;
+                    }
+                }
+                if (!found) {
+                    all_type_info_add_base_most_derived_first(bases, tinfo);
+                }
+            }
+        } else if (type->tp_bases) {
+            // It's some python type, so keep follow its bases classes to look for one or more
+            // registered types
+            if (i + 1 == check.size()) {
+                // When we're at the end, we can pop off the current element to avoid growing
+                // `check` when adding just one base (which is typical--i.e. when there is no
+                // multiple inheritance)
+                check.pop_back();
+                i--;
+            }
+            for (handle parent : reinterpret_borrow<tuple>(type->tp_bases)) {
+                check.push_back((PyTypeObject *) parent.ptr());
+            }
+        }
+    }
+}
+
+/**
+ * Extracts vector of type_info pointers of pybind-registered roots of the given Python type.  Will
+ * be just 1 pybind type for the Python type of a pybind-registered class, or for any Python-side
+ * derived class that uses single inheritance.  Will contain as many types as required for a Python
+ * class that uses multiple inheritance to inherit (directly or indirectly) from multiple
+ * pybind-registered classes.  Will be empty if neither the type nor any base classes are
+ * pybind-registered.
+ *
+ * The value is cached for the lifetime of the Python type.
+ */
+inline const std::vector<detail::type_info *> &all_type_info(PyTypeObject *type) {
+    auto ins = all_type_info_get_cache(type);
+    if (ins.second) {
+        // New cache entry: populate it
+        all_type_info_populate(type, ins.first->second);
+    }
+
+    return ins.first->second;
+}
+
+/**
+ * Gets a single pybind11 type info for a python type.  Returns nullptr if neither the type nor any
+ * ancestors are pybind11-registered.  Throws an exception if there are multiple bases--use
+ * `all_type_info` instead if you want to support multiple bases.
+ */
+PYBIND11_NOINLINE detail::type_info *get_type_info(PyTypeObject *type) {
+    const auto &bases = all_type_info(type);
+    if (bases.empty()) {
+        return nullptr;
+    }
+    if (bases.size() > 1) {
+        pybind11_fail(
+            "pybind11::detail::get_type_info: type has multiple pybind11-registered bases");
+    }
+    return bases.front();
+}
+
+inline detail::type_info *get_local_type_info(const std::type_index &tp) {
+    auto &locals = get_local_internals().registered_types_cpp;
+    auto it = locals.find(tp);
+    if (it != locals.end()) {
+        return it->second;
+    }
+    return nullptr;
+}
+
+inline detail::type_info *get_global_type_info(const std::type_index &tp) {
+    return with_internals([&](internals &internals) {
+        detail::type_info *type_info = nullptr;
+        auto &types = internals.registered_types_cpp;
+        auto it = types.find(tp);
+        if (it != types.end()) {
+            type_info = it->second;
+        }
+        return type_info;
+    });
+}
+
+/// Return the type info for a given C++ type; on lookup failure can either throw or return
+/// nullptr.
+PYBIND11_NOINLINE detail::type_info *get_type_info(const std::type_index &tp,
+                                                   bool throw_if_missing = false) {
+    if (auto *ltype = get_local_type_info(tp)) {
+        return ltype;
+    }
+    if (auto *gtype = get_global_type_info(tp)) {
+        return gtype;
+    }
+
+    if (throw_if_missing) {
+        std::string tname = tp.name();
+        detail::clean_type_id(tname);
+        pybind11_fail("pybind11::detail::get_type_info: unable to find type info for \""
+                      + std::move(tname) + '"');
+    }
+    return nullptr;
+}
+
+PYBIND11_NOINLINE handle get_type_handle(const std::type_info &tp, bool throw_if_missing) {
+    detail::type_info *type_info = get_type_info(tp, throw_if_missing);
+    return handle(type_info ? ((PyObject *) type_info->type) : nullptr);
+}
+
+// Searches the inheritance graph for a registered Python instance, using all_type_info().
+PYBIND11_NOINLINE handle find_registered_python_instance(void *src,
+                                                         const detail::type_info *tinfo) {
+    return with_instance_map(src, [&](instance_map &instances) {
+        auto it_instances = instances.equal_range(src);
+        for (auto it_i = it_instances.first; it_i != it_instances.second; ++it_i) {
+            for (auto *instance_type : detail::all_type_info(Py_TYPE(it_i->second))) {
+                if (instance_type && same_type(*instance_type->cpptype, *tinfo->cpptype)) {
+                    return handle((PyObject *) it_i->second).inc_ref();
+                }
+            }
+        }
+        return handle();
+    });
+}
+
+// Container for accessing and iterating over an instance's values/holders
+struct values_and_holders {
+private:
+    instance *inst;
+    using type_vec = std::vector<detail::type_info *>;
+    const type_vec &tinfo;
+
+public:
+    explicit values_and_holders(instance *inst)
+        : inst{inst}, tinfo(all_type_info(Py_TYPE(inst))) {}
+
+    explicit values_and_holders(PyObject *obj)
+        : inst{nullptr}, tinfo(all_type_info(Py_TYPE(obj))) {
+        if (!tinfo.empty()) {
+            inst = reinterpret_cast<instance *>(obj);
+        }
+    }
+
+    struct iterator {
+    private:
+        instance *inst = nullptr;
+        const type_vec *types = nullptr;
+        value_and_holder curr;
+        friend struct values_and_holders;
+        iterator(instance *inst, const type_vec *tinfo) : inst{inst}, types{tinfo} {
+            if (inst != nullptr) {
+                assert(!types->empty());
+                curr = value_and_holder(
+                    inst /* instance */,
+                    (*types)[0] /* type info */,
+                    0, /* vpos: (non-simple types only): the first vptr comes first */
+                    0 /* index */);
+            }
+        }
+        // Past-the-end iterator:
+        explicit iterator(size_t end) : curr(end) {}
+
+    public:
+        bool operator==(const iterator &other) const { return curr.index == other.curr.index; }
+        bool operator!=(const iterator &other) const { return curr.index != other.curr.index; }
+        iterator &operator++() {
+            if (!inst->simple_layout) {
+                curr.vh += 1 + (*types)[curr.index]->holder_size_in_ptrs;
+            }
+            ++curr.index;
+            curr.type = curr.index < types->size() ? (*types)[curr.index] : nullptr;
+            return *this;
+        }
+        value_and_holder &operator*() { return curr; }
+        value_and_holder *operator->() { return &curr; }
+    };
+
+    iterator begin() { return iterator(inst, &tinfo); }
+    iterator end() { return iterator(tinfo.size()); }
+
+    iterator find(const type_info *find_type) {
+        auto it = begin(), endit = end();
+        while (it != endit && it->type != find_type) {
+            ++it;
+        }
+        return it;
+    }
+
+    size_t size() { return tinfo.size(); }
+
+    // Band-aid workaround to fix a subtle but serious bug in a minimalistic fashion. See PR #4762.
+    bool is_redundant_value_and_holder(const value_and_holder &vh) {
+        for (size_t i = 0; i < vh.index; i++) {
+            if (PyType_IsSubtype(tinfo[i]->type, tinfo[vh.index]->type) != 0) {
+                return true;
+            }
+        }
+        return false;
+    }
+};
+
+/**
+ * Extracts C++ value and holder pointer references from an instance (which may contain multiple
+ * values/holders for python-side multiple inheritance) that match the given type.  Throws an error
+ * if the given type (or ValueType, if omitted) is not a pybind11 base of the given instance.  If
+ * `find_type` is omitted (or explicitly specified as nullptr) the first value/holder are returned,
+ * regardless of type (and the resulting .type will be nullptr).
+ *
+ * The returned object should be short-lived: in particular, it must not outlive the called-upon
+ * instance.
+ */
+PYBIND11_NOINLINE value_and_holder
+instance::get_value_and_holder(const type_info *find_type /*= nullptr default in common.h*/,
+                               bool throw_if_missing /*= true in common.h*/) {
+    // Optimize common case:
+    if (!find_type || Py_TYPE(this) == find_type->type) {
+        return value_and_holder(this, find_type, 0, 0);
+    }
+
+    detail::values_and_holders vhs(this);
+    auto it = vhs.find(find_type);
+    if (it != vhs.end()) {
+        return *it;
+    }
+
+    if (!throw_if_missing) {
+        return value_and_holder();
+    }
+
+#if defined(PYBIND11_DETAILED_ERROR_MESSAGES)
+    pybind11_fail("pybind11::detail::instance::get_value_and_holder: `"
+                  + get_fully_qualified_tp_name(find_type->type)
+                  + "' is not a pybind11 base of the given `"
+                  + get_fully_qualified_tp_name(Py_TYPE(this)) + "' instance");
+#else
+    pybind11_fail(
+        "pybind11::detail::instance::get_value_and_holder: "
+        "type is not a pybind11 base of the given instance "
+        "(#define PYBIND11_DETAILED_ERROR_MESSAGES or compile in debug mode for type details)");
+#endif
+}
+
+PYBIND11_NOINLINE void instance::allocate_layout() {
+    const auto &tinfo = all_type_info(Py_TYPE(this));
+
+    const size_t n_types = tinfo.size();
+
+    if (n_types == 0) {
+        pybind11_fail(
+            "instance allocation failed: new instance has no pybind11-registered base types");
+    }
+
+    simple_layout
+        = n_types == 1 && tinfo.front()->holder_size_in_ptrs <= instance_simple_holder_in_ptrs();
+
+    // Simple path: no python-side multiple inheritance, and a small-enough holder
+    if (simple_layout) {
+        simple_value_holder[0] = nullptr;
+        simple_holder_constructed = false;
+        simple_instance_registered = false;
+    } else { // multiple base types or a too-large holder
+        // Allocate space to hold: [v1*][h1][v2*][h2]...[bb...] where [vN*] is a value pointer,
+        // [hN] is the (uninitialized) holder instance for value N, and [bb...] is a set of bool
+        // values that tracks whether each associated holder has been initialized.  Each [block] is
+        // padded, if necessary, to an integer multiple of sizeof(void *).
+        size_t space = 0;
+        for (auto *t : tinfo) {
+            space += 1;                      // value pointer
+            space += t->holder_size_in_ptrs; // holder instance
+        }
+        size_t flags_at = space;
+        space += size_in_ptrs(n_types); // status bytes (holder_constructed and
+                                        // instance_registered)
+
+        // Allocate space for flags, values, and holders, and initialize it to 0 (flags and values,
+        // in particular, need to be 0).  Use Python's memory allocation
+        // functions: Python is using pymalloc, which is designed to be
+        // efficient for small allocations like the one we're doing here;
+        // for larger allocations they are just wrappers around malloc.
+        // TODO: is this still true for pure Python 3.6?
+        nonsimple.values_and_holders = (void **) PyMem_Calloc(space, sizeof(void *));
+        if (!nonsimple.values_and_holders) {
+            throw std::bad_alloc();
+        }
+        nonsimple.status
+            = reinterpret_cast<std::uint8_t *>(&nonsimple.values_and_holders[flags_at]);
+    }
+    owned = true;
+}
+
+// NOLINTNEXTLINE(readability-make-member-function-const)
+PYBIND11_NOINLINE void instance::deallocate_layout() {
+    if (!simple_layout) {
+        PyMem_Free(reinterpret_cast<void *>(nonsimple.values_and_holders));
+    }
+}
+
+PYBIND11_NOINLINE bool isinstance_generic(handle obj, const std::type_info &tp) {
+    handle type = detail::get_type_handle(tp, false);
+    if (!type) {
+        return false;
+    }
+    return isinstance(obj, type);
+}
+
+PYBIND11_NOINLINE handle get_object_handle(const void *ptr, const detail::type_info *type) {
+    return with_instance_map(ptr, [&](instance_map &instances) {
+        auto range = instances.equal_range(ptr);
+        for (auto it = range.first; it != range.second; ++it) {
+            for (const auto &vh : values_and_holders(it->second)) {
+                if (vh.type == type) {
+                    return handle((PyObject *) it->second);
+                }
+            }
+        }
+        return handle();
+    });
+}
+
+inline PyThreadState *get_thread_state_unchecked() {
+#if defined(PYPY_VERSION)
+    return PyThreadState_GET();
+#elif PY_VERSION_HEX < 0x030D0000
+    return _PyThreadState_UncheckedGet();
+#else
+    return PyThreadState_GetUnchecked();
+#endif
+}
+
+// Forward declarations
+void keep_alive_impl(handle nurse, handle patient);
+inline PyObject *make_new_instance(PyTypeObject *type);
+
+class type_caster_generic {
+public:
+    PYBIND11_NOINLINE explicit type_caster_generic(const std::type_info &type_info)
+        : typeinfo(get_type_info(type_info)), cpptype(&type_info) {}
+
+    explicit type_caster_generic(const type_info *typeinfo)
+        : typeinfo(typeinfo), cpptype(typeinfo ? typeinfo->cpptype : nullptr) {}
+
+    bool load(handle src, bool convert) { return load_impl<type_caster_generic>(src, convert); }
+
+    PYBIND11_NOINLINE static handle cast(const void *_src,
+                                         return_value_policy policy,
+                                         handle parent,
+                                         const detail::type_info *tinfo,
+                                         void *(*copy_constructor)(const void *),
+                                         void *(*move_constructor)(const void *),
+                                         const void *existing_holder = nullptr) {
+        if (!tinfo) { // no type info: error will be set already
+            return handle();
+        }
+
+        void *src = const_cast<void *>(_src);
+        if (src == nullptr) {
+            return none().release();
+        }
+
+        if (handle registered_inst = find_registered_python_instance(src, tinfo)) {
+            return registered_inst;
+        }
+
+        auto inst = reinterpret_steal<object>(make_new_instance(tinfo->type));
+        auto *wrapper = reinterpret_cast<instance *>(inst.ptr());
+        wrapper->owned = false;
+        void *&valueptr = values_and_holders(wrapper).begin()->value_ptr();
+
+        switch (policy) {
+            case return_value_policy::automatic:
+            case return_value_policy::take_ownership:
+                valueptr = src;
+                wrapper->owned = true;
+                break;
+
+            case return_value_policy::automatic_reference:
+            case return_value_policy::reference:
+                valueptr = src;
+                wrapper->owned = false;
+                break;
+
+            case return_value_policy::copy:
+                if (copy_constructor) {
+                    valueptr = copy_constructor(src);
+                } else {
+#if defined(PYBIND11_DETAILED_ERROR_MESSAGES)
+                    std::string type_name(tinfo->cpptype->name());
+                    detail::clean_type_id(type_name);
+                    throw cast_error("return_value_policy = copy, but type " + type_name
+                                     + " is non-copyable!");
+#else
+                    throw cast_error("return_value_policy = copy, but type is "
+                                     "non-copyable! (#define PYBIND11_DETAILED_ERROR_MESSAGES or "
+                                     "compile in debug mode for details)");
+#endif
+                }
+                wrapper->owned = true;
+                break;
+
+            case return_value_policy::move:
+                if (move_constructor) {
+                    valueptr = move_constructor(src);
+                } else if (copy_constructor) {
+                    valueptr = copy_constructor(src);
+                } else {
+#if defined(PYBIND11_DETAILED_ERROR_MESSAGES)
+                    std::string type_name(tinfo->cpptype->name());
+                    detail::clean_type_id(type_name);
+                    throw cast_error("return_value_policy = move, but type " + type_name
+                                     + " is neither movable nor copyable!");
+#else
+                    throw cast_error("return_value_policy = move, but type is neither "
+                                     "movable nor copyable! "
+                                     "(#define PYBIND11_DETAILED_ERROR_MESSAGES or compile in "
+                                     "debug mode for details)");
+#endif
+                }
+                wrapper->owned = true;
+                break;
+
+            case return_value_policy::reference_internal:
+                valueptr = src;
+                wrapper->owned = false;
+                keep_alive_impl(inst, parent);
+                break;
+
+            default:
+                throw cast_error("unhandled return_value_policy: should not happen!");
+        }
+
+        tinfo->init_instance(wrapper, existing_holder);
+
+        return inst.release();
+    }
+
+    // Base methods for generic caster; there are overridden in copyable_holder_caster
+    void load_value(value_and_holder &&v_h) {
+        auto *&vptr = v_h.value_ptr();
+        // Lazy allocation for unallocated values:
+        if (vptr == nullptr) {
+            const auto *type = v_h.type ? v_h.type : typeinfo;
+            if (type->operator_new) {
+                vptr = type->operator_new(type->type_size);
+            } else {
+#if defined(__cpp_aligned_new) && (!defined(_MSC_VER) || _MSC_VER >= 1912)
+                if (type->type_align > __STDCPP_DEFAULT_NEW_ALIGNMENT__) {
+                    vptr = ::operator new(type->type_size, std::align_val_t(type->type_align));
+                } else {
+                    vptr = ::operator new(type->type_size);
+                }
+#else
+                vptr = ::operator new(type->type_size);
+#endif
+            }
+        }
+        value = vptr;
+    }
+    bool try_implicit_casts(handle src, bool convert) {
+        for (const auto &cast : typeinfo->implicit_casts) {
+            type_caster_generic sub_caster(*cast.first);
+            if (sub_caster.load(src, convert)) {
+                value = cast.second(sub_caster.value);
+                return true;
+            }
+        }
+        return false;
+    }
+    bool try_direct_conversions(handle src) {
+        for (auto &converter : *typeinfo->direct_conversions) {
+            if (converter(src.ptr(), value)) {
+                return true;
+            }
+        }
+        return false;
+    }
+    bool try_cpp_conduit(handle src) {
+        value = try_raw_pointer_ephemeral_from_cpp_conduit(src, cpptype);
+        if (value != nullptr) {
+            return true;
+        }
+        return false;
+    }
+    void check_holder_compat() {}
+
+    PYBIND11_NOINLINE static void *local_load(PyObject *src, const type_info *ti) {
+        auto caster = type_caster_generic(ti);
+        if (caster.load(src, false)) {
+            return caster.value;
+        }
+        return nullptr;
+    }
+
+    /// Try to load with foreign typeinfo, if available. Used when there is no
+    /// native typeinfo, or when the native one wasn't able to produce a value.
+    PYBIND11_NOINLINE bool try_load_foreign_module_local(handle src) {
+        constexpr auto *local_key = PYBIND11_MODULE_LOCAL_ID;
+        const auto pytype = type::handle_of(src);
+        if (!hasattr(pytype, local_key)) {
+            return false;
+        }
+
+        type_info *foreign_typeinfo = reinterpret_borrow<capsule>(getattr(pytype, local_key));
+        // Only consider this foreign loader if actually foreign and is a loader of the correct cpp
+        // type
+        if (foreign_typeinfo->module_local_load == &local_load
+            || (cpptype && !same_type(*cpptype, *foreign_typeinfo->cpptype))) {
+            return false;
+        }
+
+        if (auto *result = foreign_typeinfo->module_local_load(src.ptr(), foreign_typeinfo)) {
+            value = result;
+            return true;
+        }
+        return false;
+    }
+
+    // Implementation of `load`; this takes the type of `this` so that it can dispatch the relevant
+    // bits of code between here and copyable_holder_caster where the two classes need different
+    // logic (without having to resort to virtual inheritance).
+    template <typename ThisT>
+    PYBIND11_NOINLINE bool load_impl(handle src, bool convert) {
+        if (!src) {
+            return false;
+        }
+        if (!typeinfo) {
+            return try_load_foreign_module_local(src);
+        }
+
+        auto &this_ = static_cast<ThisT &>(*this);
+        this_.check_holder_compat();
+
+        PyTypeObject *srctype = Py_TYPE(src.ptr());
+
+        // Case 1: If src is an exact type match for the target type then we can reinterpret_cast
+        // the instance's value pointer to the target type:
+        if (srctype == typeinfo->type) {
+            this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
+            return true;
+        }
+        // Case 2: We have a derived class
+        if (PyType_IsSubtype(srctype, typeinfo->type)) {
+            const auto &bases = all_type_info(srctype);
+            bool no_cpp_mi = typeinfo->simple_type;
+
+            // Case 2a: the python type is a Python-inherited derived class that inherits from just
+            // one simple (no MI) pybind11 class, or is an exact match, so the C++ instance is of
+            // the right type and we can use reinterpret_cast.
+            // (This is essentially the same as case 2b, but because not using multiple inheritance
+            // is extremely common, we handle it specially to avoid the loop iterator and type
+            // pointer lookup overhead)
+            if (bases.size() == 1 && (no_cpp_mi || bases.front()->type == typeinfo->type)) {
+                this_.load_value(reinterpret_cast<instance *>(src.ptr())->get_value_and_holder());
+                return true;
+            }
+            // Case 2b: the python type inherits from multiple C++ bases.  Check the bases to see
+            // if we can find an exact match (or, for a simple C++ type, an inherited match); if
+            // so, we can safely reinterpret_cast to the relevant pointer.
+            if (bases.size() > 1) {
+                for (auto *base : bases) {
+                    if (no_cpp_mi ? PyType_IsSubtype(base->type, typeinfo->type)
+                                  : base->type == typeinfo->type) {
+                        this_.load_value(
+                            reinterpret_cast<instance *>(src.ptr())->get_value_and_holder(base));
+                        return true;
+                    }
+                }
+            }
+
+            // Case 2c: C++ multiple inheritance is involved and we couldn't find an exact type
+            // match in the registered bases, above, so try implicit casting (needed for proper C++
+            // casting when MI is involved).
+            if (this_.try_implicit_casts(src, convert)) {
+                return true;
+            }
+        }
+
+        // Perform an implicit conversion
+        if (convert) {
+            for (const auto &converter : typeinfo->implicit_conversions) {
+                auto temp = reinterpret_steal<object>(converter(src.ptr(), typeinfo->type));
+                if (load_impl<ThisT>(temp, false)) {
+                    loader_life_support::add_patient(temp);
+                    return true;
+                }
+            }
+            if (this_.try_direct_conversions(src)) {
+                return true;
+            }
+        }
+
+        // Failed to match local typeinfo. Try again with global.
+        if (typeinfo->module_local) {
+            if (auto *gtype = get_global_type_info(*typeinfo->cpptype)) {
+                typeinfo = gtype;
+                return load(src, false);
+            }
+        }
+
+        // Global typeinfo has precedence over foreign module_local
+        if (try_load_foreign_module_local(src)) {
+            return true;
+        }
+
+        // Custom converters didn't take None, now we convert None to nullptr.
+        if (src.is_none()) {
+            // Defer accepting None to other overloads (if we aren't in convert mode):
+            if (!convert) {
+                return false;
+            }
+            value = nullptr;
+            return true;
+        }
+
+        if (convert && cpptype && this_.try_cpp_conduit(src)) {
+            return true;
+        }
+
+        return false;
+    }
+
+    // Called to do type lookup and wrap the pointer and type in a pair when a dynamic_cast
+    // isn't needed or can't be used.  If the type is unknown, sets the error and returns a pair
+    // with .second = nullptr.  (p.first = nullptr is not an error: it becomes None).
+    PYBIND11_NOINLINE static std::pair<const void *, const type_info *>
+    src_and_type(const void *src,
+                 const std::type_info &cast_type,
+                 const std::type_info *rtti_type = nullptr) {
+        if (auto *tpi = get_type_info(cast_type)) {
+            return {src, const_cast<const type_info *>(tpi)};
+        }
+
+        // Not found, set error:
+        std::string tname = rtti_type ? rtti_type->name() : cast_type.name();
+        detail::clean_type_id(tname);
+        std::string msg = "Unregistered type : " + tname;
+        set_error(PyExc_TypeError, msg.c_str());
+        return {nullptr, nullptr};
+    }
+
+    const type_info *typeinfo = nullptr;
+    const std::type_info *cpptype = nullptr;
+    void *value = nullptr;
+};
+
+inline object cpp_conduit_method(handle self,
+                                 const bytes &pybind11_platform_abi_id,
+                                 const capsule &cpp_type_info_capsule,
+                                 const bytes &pointer_kind) {
+#ifdef PYBIND11_HAS_STRING_VIEW
+    using cpp_str = std::string_view;
+#else
+    using cpp_str = std::string;
+#endif
+    if (cpp_str(pybind11_platform_abi_id) != PYBIND11_PLATFORM_ABI_ID) {
+        return none();
+    }
+    if (std::strcmp(cpp_type_info_capsule.name(), typeid(std::type_info).name()) != 0) {
+        return none();
+    }
+    if (cpp_str(pointer_kind) != "raw_pointer_ephemeral") {
+        throw std::runtime_error("Invalid pointer_kind: \"" + std::string(pointer_kind) + "\"");
+    }
+    const auto *cpp_type_info = cpp_type_info_capsule.get_pointer<const std::type_info>();
+    type_caster_generic caster(*cpp_type_info);
+    if (!caster.load(self, false)) {
+        return none();
+    }
+    return capsule(caster.value, cpp_type_info->name());
+}
+
+/**
+ * Determine suitable casting operator for pointer-or-lvalue-casting type casters.  The type caster
+ * needs to provide `operator T*()` and `operator T&()` operators.
+ *
+ * If the type supports moving the value away via an `operator T&&() &&` method, it should use
+ * `movable_cast_op_type` instead.
+ */
+template <typename T>
+using cast_op_type = conditional_t<std::is_pointer<remove_reference_t<T>>::value,
+                                   typename std::add_pointer<intrinsic_t<T>>::type,
+                                   typename std::add_lvalue_reference<intrinsic_t<T>>::type>;
+
+/**
+ * Determine suitable casting operator for a type caster with a movable value.  Such a type caster
+ * needs to provide `operator T*()`, `operator T&()`, and `operator T&&() &&`.  The latter will be
+ * called in appropriate contexts where the value can be moved rather than copied.
+ *
+ * These operator are automatically provided when using the PYBIND11_TYPE_CASTER macro.
+ */
+template <typename T>
+using movable_cast_op_type
+    = conditional_t<std::is_pointer<typename std::remove_reference<T>::type>::value,
+                    typename std::add_pointer<intrinsic_t<T>>::type,
+                    conditional_t<std::is_rvalue_reference<T>::value,
+                                  typename std::add_rvalue_reference<intrinsic_t<T>>::type,
+                                  typename std::add_lvalue_reference<intrinsic_t<T>>::type>>;
+
+// Does the container have a mapped type and is it recursive?
+// Implemented by specializations below.
+template <typename Container, typename SFINAE = void>
+struct container_mapped_type_traits {
+    static constexpr bool has_mapped_type = false;
+    static constexpr bool has_recursive_mapped_type = false;
+};
+
+template <typename Container>
+struct container_mapped_type_traits<
+    Container,
+    typename std::enable_if<
+        std::is_same<typename Container::mapped_type, Container>::value>::type> {
+    static constexpr bool has_mapped_type = true;
+    static constexpr bool has_recursive_mapped_type = true;
+};
+
+template <typename Container>
+struct container_mapped_type_traits<
+    Container,
+    typename std::enable_if<
+        negation<std::is_same<typename Container::mapped_type, Container>>::value>::type> {
+    static constexpr bool has_mapped_type = true;
+    static constexpr bool has_recursive_mapped_type = false;
+};
+
+// Does the container have a value type and is it recursive?
+// Implemented by specializations below.
+template <typename Container, typename SFINAE = void>
+struct container_value_type_traits : std::false_type {
+    static constexpr bool has_value_type = false;
+    static constexpr bool has_recursive_value_type = false;
+};
+
+template <typename Container>
+struct container_value_type_traits<
+    Container,
+    typename std::enable_if<
+        std::is_same<typename Container::value_type, Container>::value>::type> {
+    static constexpr bool has_value_type = true;
+    static constexpr bool has_recursive_value_type = true;
+};
+
+template <typename Container>
+struct container_value_type_traits<
+    Container,
+    typename std::enable_if<
+        negation<std::is_same<typename Container::value_type, Container>>::value>::type> {
+    static constexpr bool has_value_type = true;
+    static constexpr bool has_recursive_value_type = false;
+};
+
+/*
+ * Tag to be used for representing the bottom of recursively defined types.
+ * Define this tag so we don't have to use void.
+ */
+struct recursive_bottom {};
+
+/*
+ * Implementation detail of `recursive_container_traits` below.
+ * `T` is the `value_type` of the container, which might need to be modified to
+ * avoid recursive types and const types.
+ */
+template <typename T, bool is_this_a_map>
+struct impl_type_to_check_recursively {
+    /*
+     * If the container is recursive, then no further recursion should be done.
+     */
+    using if_recursive = recursive_bottom;
+    /*
+     * Otherwise yield `T` unchanged.
+     */
+    using if_not_recursive = T;
+};
+
+/*
+ * For pairs - only as value type of a map -, the first type should remove the `const`.
+ * Also, if the map is recursive, then the recursive checking should consider
+ * the first type only.
+ */
+template <typename A, typename B>
+struct impl_type_to_check_recursively<std::pair<A, B>, /* is_this_a_map = */ true> {
+    using if_recursive = typename std::remove_const<A>::type;
+    using if_not_recursive = std::pair<typename std::remove_const<A>::type, B>;
+};
+
+/*
+ * Implementation of `recursive_container_traits` below.
+ */
+template <typename Container, typename SFINAE = void>
+struct impl_recursive_container_traits {
+    using type_to_check_recursively = recursive_bottom;
+};
+
+template <typename Container>
+struct impl_recursive_container_traits<
+    Container,
+    typename std::enable_if<container_value_type_traits<Container>::has_value_type>::type> {
+    static constexpr bool is_recursive
+        = container_mapped_type_traits<Container>::has_recursive_mapped_type
+          || container_value_type_traits<Container>::has_recursive_value_type;
+    /*
+     * This member dictates which type Pybind11 should check recursively in traits
+     * such as `is_move_constructible`, `is_copy_constructible`, `is_move_assignable`, ...
+     * Direct access to `value_type` should be avoided:
+     * 1. `value_type` might recursively contain the type again
+     * 2. `value_type` of STL map types is `std::pair<A const, B>`, the `const`
+     *    should be removed.
+     *
+     */
+    using type_to_check_recursively = typename std::conditional<
+        is_recursive,
+        typename impl_type_to_check_recursively<
+            typename Container::value_type,
+            container_mapped_type_traits<Container>::has_mapped_type>::if_recursive,
+        typename impl_type_to_check_recursively<
+            typename Container::value_type,
+            container_mapped_type_traits<Container>::has_mapped_type>::if_not_recursive>::type;
+};
+
+/*
+ * This trait defines the `type_to_check_recursively` which is needed to properly
+ * handle recursively defined traits such as `is_move_constructible` without going
+ * into an infinite recursion.
+ * Should be used instead of directly accessing the `value_type`.
+ * It cancels the recursion by returning the `recursive_bottom` tag.
+ *
+ * The default definition of `type_to_check_recursively` is as follows:
+ *
+ * 1. By default, it is `recursive_bottom`, so that the recursion is canceled.
+ * 2. If the type is non-recursive and defines a `value_type`, then the `value_type` is used.
+ *    If the `value_type` is a pair and a `mapped_type` is defined,
+ *    then the `const` is removed from the first type.
+ * 3. If the type is recursive and `value_type` is not a pair, then `recursive_bottom` is returned.
+ * 4. If the type is recursive and `value_type` is a pair and a `mapped_type` is defined,
+ *    then `const` is removed from the first type and the first type is returned.
+ *
+ * This behavior can be extended by the user as seen in test_stl_binders.cpp.
+ *
+ * This struct is exactly the same as impl_recursive_container_traits.
+ * The duplication achieves that user-defined specializations don't compete
+ * with internal specializations, but take precedence.
+ */
+template <typename Container, typename SFINAE = void>
+struct recursive_container_traits : impl_recursive_container_traits<Container> {};
+
+template <typename T>
+struct is_move_constructible
+    : all_of<std::is_move_constructible<T>,
+             is_move_constructible<
+                 typename recursive_container_traits<T>::type_to_check_recursively>> {};
+
+template <>
+struct is_move_constructible<recursive_bottom> : std::true_type {};
+
+// Likewise for std::pair
+// (after C++17 it is mandatory that the move constructor not exist when the two types aren't
+// themselves move constructible, but this can not be relied upon when T1 or T2 are themselves
+// containers).
+template <typename T1, typename T2>
+struct is_move_constructible<std::pair<T1, T2>>
+    : all_of<is_move_constructible<T1>, is_move_constructible<T2>> {};
+
+// std::is_copy_constructible isn't quite enough: it lets std::vector<T> (and similar) through when
+// T is non-copyable, but code containing such a copy constructor fails to actually compile.
+template <typename T>
+struct is_copy_constructible
+    : all_of<std::is_copy_constructible<T>,
+             is_copy_constructible<
+                 typename recursive_container_traits<T>::type_to_check_recursively>> {};
+
+template <>
+struct is_copy_constructible<recursive_bottom> : std::true_type {};
+
+// Likewise for std::pair
+// (after C++17 it is mandatory that the copy constructor not exist when the two types aren't
+// themselves copy constructible, but this can not be relied upon when T1 or T2 are themselves
+// containers).
+template <typename T1, typename T2>
+struct is_copy_constructible<std::pair<T1, T2>>
+    : all_of<is_copy_constructible<T1>, is_copy_constructible<T2>> {};
+
+// The same problems arise with std::is_copy_assignable, so we use the same workaround.
+template <typename T>
+struct is_copy_assignable
+    : all_of<
+          std::is_copy_assignable<T>,
+          is_copy_assignable<typename recursive_container_traits<T>::type_to_check_recursively>> {
+};
+
+template <>
+struct is_copy_assignable<recursive_bottom> : std::true_type {};
+
+template <typename T1, typename T2>
+struct is_copy_assignable<std::pair<T1, T2>>
+    : all_of<is_copy_assignable<T1>, is_copy_assignable<T2>> {};
+
+PYBIND11_NAMESPACE_END(detail)
+
+// polymorphic_type_hook<itype>::get(src, tinfo) determines whether the object pointed
+// to by `src` actually is an instance of some class derived from `itype`.
+// If so, it sets `tinfo` to point to the std::type_info representing that derived
+// type, and returns a pointer to the start of the most-derived object of that type
+// (in which `src` is a subobject; this will be the same address as `src` in most
+// single inheritance cases). If not, or if `src` is nullptr, it simply returns `src`
+// and leaves `tinfo` at its default value of nullptr.
+//
+// The default polymorphic_type_hook just returns src. A specialization for polymorphic
+// types determines the runtime type of the passed object and adjusts the this-pointer
+// appropriately via dynamic_cast<void*>. This is what enables a C++ Animal* to appear
+// to Python as a Dog (if Dog inherits from Animal, Animal is polymorphic, Dog is
+// registered with pybind11, and this Animal is in fact a Dog).
+//
+// You may specialize polymorphic_type_hook yourself for types that want to appear
+// polymorphic to Python but do not use C++ RTTI. (This is a not uncommon pattern
+// in performance-sensitive applications, used most notably in LLVM.)
+//
+// polymorphic_type_hook_base allows users to specialize polymorphic_type_hook with
+// std::enable_if. User provided specializations will always have higher priority than
+// the default implementation and specialization provided in polymorphic_type_hook_base.
+template <typename itype, typename SFINAE = void>
+struct polymorphic_type_hook_base {
+    static const void *get(const itype *src, const std::type_info *&) { return src; }
+};
+template <typename itype>
+struct polymorphic_type_hook_base<itype, detail::enable_if_t<std::is_polymorphic<itype>::value>> {
+    static const void *get(const itype *src, const std::type_info *&type) {
+        type = src ? &typeid(*src) : nullptr;
+        return dynamic_cast<const void *>(src);
+    }
+};
+template <typename itype, typename SFINAE = void>
+struct polymorphic_type_hook : public polymorphic_type_hook_base<itype> {};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+/// Generic type caster for objects stored on the heap
+template <typename type>
+class type_caster_base : public type_caster_generic {
+    using itype = intrinsic_t<type>;
+
+public:
+    static constexpr auto name = const_name<type>();
+
+    type_caster_base() : type_caster_base(typeid(type)) {}
+    explicit type_caster_base(const std::type_info &info) : type_caster_generic(info) {}
+
+    static handle cast(const itype &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic
+            || policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::copy;
+        }
+        return cast(std::addressof(src), policy, parent);
+    }
+
+    static handle cast(itype &&src, return_value_policy, handle parent) {
+        return cast(std::addressof(src), return_value_policy::move, parent);
+    }
+
+    // Returns a (pointer, type_info) pair taking care of necessary type lookup for a
+    // polymorphic type (using RTTI by default, but can be overridden by specializing
+    // polymorphic_type_hook). If the instance isn't derived, returns the base version.
+    static std::pair<const void *, const type_info *> src_and_type(const itype *src) {
+        const auto &cast_type = typeid(itype);
+        const std::type_info *instance_type = nullptr;
+        const void *vsrc = polymorphic_type_hook<itype>::get(src, instance_type);
+        if (instance_type && !same_type(cast_type, *instance_type)) {
+            // This is a base pointer to a derived type. If the derived type is registered
+            // with pybind11, we want to make the full derived object available.
+            // In the typical case where itype is polymorphic, we get the correct
+            // derived pointer (which may be != base pointer) by a dynamic_cast to
+            // most derived type. If itype is not polymorphic, we won't get here
+            // except via a user-provided specialization of polymorphic_type_hook,
+            // and the user has promised that no this-pointer adjustment is
+            // required in that case, so it's OK to use static_cast.
+            if (const auto *tpi = get_type_info(*instance_type)) {
+                return {vsrc, tpi};
+            }
+        }
+        // Otherwise we have either a nullptr, an `itype` pointer, or an unknown derived pointer,
+        // so don't do a cast
+        return type_caster_generic::src_and_type(src, cast_type, instance_type);
+    }
+
+    static handle cast(const itype *src, return_value_policy policy, handle parent) {
+        auto st = src_and_type(src);
+        return type_caster_generic::cast(st.first,
+                                         policy,
+                                         parent,
+                                         st.second,
+                                         make_copy_constructor(src),
+                                         make_move_constructor(src));
+    }
+
+    static handle cast_holder(const itype *src, const void *holder) {
+        auto st = src_and_type(src);
+        return type_caster_generic::cast(st.first,
+                                         return_value_policy::take_ownership,
+                                         {},
+                                         st.second,
+                                         nullptr,
+                                         nullptr,
+                                         holder);
+    }
+
+    template <typename T>
+    using cast_op_type = detail::cast_op_type<T>;
+
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator itype *() { return (type *) value; }
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator itype &() {
+        if (!value) {
+            throw reference_cast_error();
+        }
+        return *((itype *) value);
+    }
+
+protected:
+    using Constructor = void *(*) (const void *);
+
+    /* Only enabled when the types are {copy,move}-constructible *and* when the type
+       does not have a private operator new implementation. A comma operator is used in the
+       decltype argument to apply SFINAE to the public copy/move constructors.*/
+    template <typename T, typename = enable_if_t<is_copy_constructible<T>::value>>
+    static auto make_copy_constructor(const T *) -> decltype(new T(std::declval<const T>()),
+                                                             Constructor{}) {
+        return [](const void *arg) -> void * { return new T(*reinterpret_cast<const T *>(arg)); };
+    }
+
+    template <typename T, typename = enable_if_t<is_move_constructible<T>::value>>
+    static auto make_move_constructor(const T *) -> decltype(new T(std::declval<T &&>()),
+                                                             Constructor{}) {
+        return [](const void *arg) -> void * {
+            return new T(std::move(*const_cast<T *>(reinterpret_cast<const T *>(arg))));
+        };
+    }
+
+    static Constructor make_copy_constructor(...) { return nullptr; }
+    static Constructor make_move_constructor(...) { return nullptr; }
+};
+
+inline std::string quote_cpp_type_name(const std::string &cpp_type_name) {
+    return cpp_type_name; // No-op for now. See PR #4888
+}
+
+PYBIND11_NOINLINE std::string type_info_description(const std::type_info &ti) {
+    if (auto *type_data = get_type_info(ti)) {
+        handle th((PyObject *) type_data->type);
+        return th.attr("__module__").cast<std::string>() + '.'
+               + th.attr("__qualname__").cast<std::string>();
+    }
+    return quote_cpp_type_name(clean_type_id(ti.name()));
+}
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/common.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/common.h
new file mode 100644
index 0000000000000000000000000000000000000000..24f56d158442c2e2f4667806e91fd94b476cc96e
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/common.h
@@ -0,0 +1,9 @@
+// Copyright (c) 2023 The pybind Community.
+
+#pragma once
+
+// Common message for `static_assert()`s, which are useful to easily
+// preempt much less obvious errors.
+#define PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED                                    \
+    "Pointer types (in particular `PyObject *`) are not supported as scalar types for Eigen "     \
+    "types."
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/matrix.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/matrix.h
new file mode 100644
index 0000000000000000000000000000000000000000..5cf1f0a2a0566a8155ef7bcc10806883286615ff
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/matrix.h
@@ -0,0 +1,715 @@
+/*
+    pybind11/eigen/matrix.h: Transparent conversion for dense and sparse Eigen matrices
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include <pybind11/numpy.h>
+
+#include "common.h"
+
+/* HINT: To suppress warnings originating from the Eigen headers, use -isystem.
+   See also:
+       https://stackoverflow.com/questions/2579576/i-dir-vs-isystem-dir
+       https://stackoverflow.com/questions/1741816/isystem-for-ms-visual-studio-c-compiler
+*/
+PYBIND11_WARNING_PUSH
+PYBIND11_WARNING_DISABLE_MSVC(5054) // https://github.com/pybind/pybind11/pull/3741
+//       C5054: operator '&': deprecated between enumerations of different types
+#if defined(__MINGW32__)
+PYBIND11_WARNING_DISABLE_GCC("-Wmaybe-uninitialized")
+#endif
+
+#include <Eigen/Core>
+#include <Eigen/SparseCore>
+
+PYBIND11_WARNING_POP
+
+// Eigen prior to 3.2.7 doesn't have proper move constructors--but worse, some classes get implicit
+// move constructors that break things.  We could detect this an explicitly copy, but an extra copy
+// of matrices seems highly undesirable.
+static_assert(EIGEN_VERSION_AT_LEAST(3, 2, 7),
+              "Eigen matrix support in pybind11 requires Eigen >= 3.2.7");
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_WARNING_DISABLE_MSVC(4127)
+
+// Provide a convenience alias for easier pass-by-ref usage with fully dynamic strides:
+using EigenDStride = Eigen::Stride<Eigen::Dynamic, Eigen::Dynamic>;
+template <typename MatrixType>
+using EigenDRef = Eigen::Ref<MatrixType, 0, EigenDStride>;
+template <typename MatrixType>
+using EigenDMap = Eigen::Map<MatrixType, 0, EigenDStride>;
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+#if EIGEN_VERSION_AT_LEAST(3, 3, 0)
+using EigenIndex = Eigen::Index;
+template <typename Scalar, int Flags, typename StorageIndex>
+using EigenMapSparseMatrix = Eigen::Map<Eigen::SparseMatrix<Scalar, Flags, StorageIndex>>;
+#else
+using EigenIndex = EIGEN_DEFAULT_DENSE_INDEX_TYPE;
+template <typename Scalar, int Flags, typename StorageIndex>
+using EigenMapSparseMatrix = Eigen::MappedSparseMatrix<Scalar, Flags, StorageIndex>;
+#endif
+
+// Matches Eigen::Map, Eigen::Ref, blocks, etc:
+template <typename T>
+using is_eigen_dense_map = all_of<is_template_base_of<Eigen::DenseBase, T>,
+                                  std::is_base_of<Eigen::MapBase<T, Eigen::ReadOnlyAccessors>, T>>;
+template <typename T>
+using is_eigen_mutable_map = std::is_base_of<Eigen::MapBase<T, Eigen::WriteAccessors>, T>;
+template <typename T>
+using is_eigen_dense_plain
+    = all_of<negation<is_eigen_dense_map<T>>, is_template_base_of<Eigen::PlainObjectBase, T>>;
+template <typename T>
+using is_eigen_sparse = is_template_base_of<Eigen::SparseMatrixBase, T>;
+// Test for objects inheriting from EigenBase<Derived> that aren't captured by the above.  This
+// basically covers anything that can be assigned to a dense matrix but that don't have a typical
+// matrix data layout that can be copied from their .data().  For example, DiagonalMatrix and
+// SelfAdjointView fall into this category.
+template <typename T>
+using is_eigen_other
+    = all_of<is_template_base_of<Eigen::EigenBase, T>,
+             negation<any_of<is_eigen_dense_map<T>, is_eigen_dense_plain<T>, is_eigen_sparse<T>>>>;
+
+// Captures numpy/eigen conformability status (returned by EigenProps::conformable()):
+template <bool EigenRowMajor>
+struct EigenConformable {
+    bool conformable = false;
+    EigenIndex rows = 0, cols = 0;
+    EigenDStride stride{0, 0};    // Only valid if negativestrides is false!
+    bool negativestrides = false; // If true, do not use stride!
+
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    EigenConformable(bool fits = false) : conformable{fits} {}
+    // Matrix type:
+    EigenConformable(EigenIndex r, EigenIndex c, EigenIndex rstride, EigenIndex cstride)
+        : conformable{true}, rows{r}, cols{c},
+          // TODO: when Eigen bug #747 is fixed, remove the tests for non-negativity.
+          // http://eigen.tuxfamily.org/bz/show_bug.cgi?id=747
+          stride{EigenRowMajor ? (rstride > 0 ? rstride : 0)
+                               : (cstride > 0 ? cstride : 0) /* outer stride */,
+                 EigenRowMajor ? (cstride > 0 ? cstride : 0)
+                               : (rstride > 0 ? rstride : 0) /* inner stride */},
+          negativestrides{rstride < 0 || cstride < 0} {}
+    // Vector type:
+    EigenConformable(EigenIndex r, EigenIndex c, EigenIndex stride)
+        : EigenConformable(r, c, r == 1 ? c * stride : stride, c == 1 ? r : r * stride) {}
+
+    template <typename props>
+    bool stride_compatible() const {
+        // To have compatible strides, we need (on both dimensions) one of fully dynamic strides,
+        // matching strides, or a dimension size of 1 (in which case the stride value is
+        // irrelevant). Alternatively, if any dimension size is 0, the strides are not relevant
+        // (and numpy ≥ 1.23 sets the strides to 0 in that case, so we need to check explicitly).
+        if (negativestrides) {
+            return false;
+        }
+        if (rows == 0 || cols == 0) {
+            return true;
+        }
+        return (props::inner_stride == Eigen::Dynamic || props::inner_stride == stride.inner()
+                || (EigenRowMajor ? cols : rows) == 1)
+               && (props::outer_stride == Eigen::Dynamic || props::outer_stride == stride.outer()
+                   || (EigenRowMajor ? rows : cols) == 1);
+    }
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator bool() const { return conformable; }
+};
+
+template <typename Type>
+struct eigen_extract_stride {
+    using type = Type;
+};
+template <typename PlainObjectType, int MapOptions, typename StrideType>
+struct eigen_extract_stride<Eigen::Map<PlainObjectType, MapOptions, StrideType>> {
+    using type = StrideType;
+};
+template <typename PlainObjectType, int Options, typename StrideType>
+struct eigen_extract_stride<Eigen::Ref<PlainObjectType, Options, StrideType>> {
+    using type = StrideType;
+};
+
+// Helper struct for extracting information from an Eigen type
+template <typename Type_>
+struct EigenProps {
+    using Type = Type_;
+    using Scalar = typename Type::Scalar;
+    using StrideType = typename eigen_extract_stride<Type>::type;
+    static constexpr EigenIndex rows = Type::RowsAtCompileTime, cols = Type::ColsAtCompileTime,
+                                size = Type::SizeAtCompileTime;
+    static constexpr bool row_major = Type::IsRowMajor,
+                          vector
+                          = Type::IsVectorAtCompileTime, // At least one dimension has fixed size 1
+        fixed_rows = rows != Eigen::Dynamic, fixed_cols = cols != Eigen::Dynamic,
+                          fixed = size != Eigen::Dynamic, // Fully-fixed size
+        dynamic = !fixed_rows && !fixed_cols;             // Fully-dynamic size
+
+    template <EigenIndex i, EigenIndex ifzero>
+    using if_zero = std::integral_constant<EigenIndex, i == 0 ? ifzero : i>;
+    static constexpr EigenIndex inner_stride
+        = if_zero<StrideType::InnerStrideAtCompileTime, 1>::value,
+        outer_stride = if_zero < StrideType::OuterStrideAtCompileTime,
+        vector      ? size
+        : row_major ? cols
+                    : rows > ::value;
+    static constexpr bool dynamic_stride
+        = inner_stride == Eigen::Dynamic && outer_stride == Eigen::Dynamic;
+    static constexpr bool requires_row_major
+        = !dynamic_stride && !vector && (row_major ? inner_stride : outer_stride) == 1;
+    static constexpr bool requires_col_major
+        = !dynamic_stride && !vector && (row_major ? outer_stride : inner_stride) == 1;
+
+    // Takes an input array and determines whether we can make it fit into the Eigen type.  If
+    // the array is a vector, we attempt to fit it into either an Eigen 1xN or Nx1 vector
+    // (preferring the latter if it will fit in either, i.e. for a fully dynamic matrix type).
+    static EigenConformable<row_major> conformable(const array &a) {
+        const auto dims = a.ndim();
+        if (dims < 1 || dims > 2) {
+            return false;
+        }
+
+        if (dims == 2) { // Matrix type: require exact match (or dynamic)
+
+            EigenIndex np_rows = a.shape(0), np_cols = a.shape(1),
+                       np_rstride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar)),
+                       np_cstride = a.strides(1) / static_cast<ssize_t>(sizeof(Scalar));
+            if ((fixed_rows && np_rows != rows) || (fixed_cols && np_cols != cols)) {
+                return false;
+            }
+
+            return {np_rows, np_cols, np_rstride, np_cstride};
+        }
+
+        // Otherwise we're storing an n-vector.  Only one of the strides will be used, but
+        // whichever is used, we want the (single) numpy stride value.
+        const EigenIndex n = a.shape(0),
+                         stride = a.strides(0) / static_cast<ssize_t>(sizeof(Scalar));
+
+        if (vector) { // Eigen type is a compile-time vector
+            if (fixed && size != n) {
+                return false; // Vector size mismatch
+            }
+            return {rows == 1 ? 1 : n, cols == 1 ? 1 : n, stride};
+        }
+        if (fixed) {
+            // The type has a fixed size, but is not a vector: abort
+            return false;
+        }
+        if (fixed_cols) {
+            // Since this isn't a vector, cols must be != 1.  We allow this only if it exactly
+            // equals the number of elements (rows is Dynamic, and so 1 row is allowed).
+            if (cols != n) {
+                return false;
+            }
+            return {1, n, stride};
+        } // Otherwise it's either fully dynamic, or column dynamic; both become a column vector
+        if (fixed_rows && rows != n) {
+            return false;
+        }
+        return {n, 1, stride};
+    }
+
+    static constexpr bool show_writeable
+        = is_eigen_dense_map<Type>::value && is_eigen_mutable_map<Type>::value;
+    static constexpr bool show_order = is_eigen_dense_map<Type>::value;
+    static constexpr bool show_c_contiguous = show_order && requires_row_major;
+    static constexpr bool show_f_contiguous
+        = !show_c_contiguous && show_order && requires_col_major;
+
+    static constexpr auto descriptor
+        = const_name("numpy.ndarray[") + npy_format_descriptor<Scalar>::name + const_name("[")
+          + const_name<fixed_rows>(const_name<(size_t) rows>(), const_name("m")) + const_name(", ")
+          + const_name<fixed_cols>(const_name<(size_t) cols>(), const_name("n")) + const_name("]")
+          +
+          // For a reference type (e.g. Ref<MatrixXd>) we have other constraints that might need to
+          // be satisfied: writeable=True (for a mutable reference), and, depending on the map's
+          // stride options, possibly f_contiguous or c_contiguous.  We include them in the
+          // descriptor output to provide some hint as to why a TypeError is occurring (otherwise
+          // it can be confusing to see that a function accepts a 'numpy.ndarray[float64[3,2]]' and
+          // an error message that you *gave* a numpy.ndarray of the right type and dimensions.
+          const_name<show_writeable>(", flags.writeable", "")
+          + const_name<show_c_contiguous>(", flags.c_contiguous", "")
+          + const_name<show_f_contiguous>(", flags.f_contiguous", "") + const_name("]");
+};
+
+// Casts an Eigen type to numpy array.  If given a base, the numpy array references the src data,
+// otherwise it'll make a copy.  writeable lets you turn off the writeable flag for the array.
+template <typename props>
+handle
+eigen_array_cast(typename props::Type const &src, handle base = handle(), bool writeable = true) {
+    constexpr ssize_t elem_size = sizeof(typename props::Scalar);
+    array a;
+    if (props::vector) {
+        a = array({src.size()}, {elem_size * src.innerStride()}, src.data(), base);
+    } else {
+        a = array({src.rows(), src.cols()},
+                  {elem_size * src.rowStride(), elem_size * src.colStride()},
+                  src.data(),
+                  base);
+    }
+
+    if (!writeable) {
+        array_proxy(a.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_;
+    }
+
+    return a.release();
+}
+
+// Takes an lvalue ref to some Eigen type and a (python) base object, creating a numpy array that
+// reference the Eigen object's data with `base` as the python-registered base class (if omitted,
+// the base will be set to None, and lifetime management is up to the caller).  The numpy array is
+// non-writeable if the given type is const.
+template <typename props, typename Type>
+handle eigen_ref_array(Type &src, handle parent = none()) {
+    // none here is to get past array's should-we-copy detection, which currently always
+    // copies when there is no base.  Setting the base to None should be harmless.
+    return eigen_array_cast<props>(src, parent, !std::is_const<Type>::value);
+}
+
+// Takes a pointer to some dense, plain Eigen type, builds a capsule around it, then returns a
+// numpy array that references the encapsulated data with a python-side reference to the capsule to
+// tie its destruction to that of any dependent python objects.  Const-ness is determined by
+// whether or not the Type of the pointer given is const.
+template <typename props, typename Type, typename = enable_if_t<is_eigen_dense_plain<Type>::value>>
+handle eigen_encapsulate(Type *src) {
+    capsule base(src, [](void *o) { delete static_cast<Type *>(o); });
+    return eigen_ref_array<props>(*src, base);
+}
+
+// Type caster for regular, dense matrix types (e.g. MatrixXd), but not maps/refs/etc. of dense
+// types.
+template <typename Type>
+struct type_caster<Type, enable_if_t<is_eigen_dense_plain<Type>::value>> {
+    using Scalar = typename Type::Scalar;
+    static_assert(!std::is_pointer<Scalar>::value,
+                  PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED);
+    using props = EigenProps<Type>;
+
+    bool load(handle src, bool convert) {
+        // If we're in no-convert mode, only load if given an array of the correct type
+        if (!convert && !isinstance<array_t<Scalar>>(src)) {
+            return false;
+        }
+
+        // Coerce into an array, but don't do type conversion yet; the copy below handles it.
+        auto buf = array::ensure(src);
+
+        if (!buf) {
+            return false;
+        }
+
+        auto dims = buf.ndim();
+        if (dims < 1 || dims > 2) {
+            return false;
+        }
+
+        auto fits = props::conformable(buf);
+        if (!fits) {
+            return false;
+        }
+
+        // Allocate the new type, then build a numpy reference into it
+        value = Type(fits.rows, fits.cols);
+        auto ref = reinterpret_steal<array>(eigen_ref_array<props>(value));
+        if (dims == 1) {
+            ref = ref.squeeze();
+        } else if (ref.ndim() == 1) {
+            buf = buf.squeeze();
+        }
+
+        int result = detail::npy_api::get().PyArray_CopyInto_(ref.ptr(), buf.ptr());
+
+        if (result < 0) { // Copy failed!
+            PyErr_Clear();
+            return false;
+        }
+
+        return true;
+    }
+
+private:
+    // Cast implementation
+    template <typename CType>
+    static handle cast_impl(CType *src, return_value_policy policy, handle parent) {
+        switch (policy) {
+            case return_value_policy::take_ownership:
+            case return_value_policy::automatic:
+                return eigen_encapsulate<props>(src);
+            case return_value_policy::move:
+                return eigen_encapsulate<props>(new CType(std::move(*src)));
+            case return_value_policy::copy:
+                return eigen_array_cast<props>(*src);
+            case return_value_policy::reference:
+            case return_value_policy::automatic_reference:
+                return eigen_ref_array<props>(*src);
+            case return_value_policy::reference_internal:
+                return eigen_ref_array<props>(*src, parent);
+            default:
+                throw cast_error("unhandled return_value_policy: should not happen!");
+        };
+    }
+
+public:
+    // Normal returned non-reference, non-const value:
+    static handle cast(Type &&src, return_value_policy /* policy */, handle parent) {
+        return cast_impl(&src, return_value_policy::move, parent);
+    }
+    // If you return a non-reference const, we mark the numpy array readonly:
+    static handle cast(const Type &&src, return_value_policy /* policy */, handle parent) {
+        return cast_impl(&src, return_value_policy::move, parent);
+    }
+    // lvalue reference return; default (automatic) becomes copy
+    static handle cast(Type &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic
+            || policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::copy;
+        }
+        return cast_impl(&src, policy, parent);
+    }
+    // const lvalue reference return; default (automatic) becomes copy
+    static handle cast(const Type &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic
+            || policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::copy;
+        }
+        return cast(&src, policy, parent);
+    }
+    // non-const pointer return
+    static handle cast(Type *src, return_value_policy policy, handle parent) {
+        return cast_impl(src, policy, parent);
+    }
+    // const pointer return
+    static handle cast(const Type *src, return_value_policy policy, handle parent) {
+        return cast_impl(src, policy, parent);
+    }
+
+    static constexpr auto name = props::descriptor;
+
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator Type *() { return &value; }
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator Type &() { return value; }
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator Type &&() && { return std::move(value); }
+    template <typename T>
+    using cast_op_type = movable_cast_op_type<T>;
+
+private:
+    Type value;
+};
+
+// Base class for casting reference/map/block/etc. objects back to python.
+template <typename MapType>
+struct eigen_map_caster {
+    static_assert(!std::is_pointer<typename MapType::Scalar>::value,
+                  PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED);
+
+private:
+    using props = EigenProps<MapType>;
+
+public:
+    // Directly referencing a ref/map's data is a bit dangerous (whatever the map/ref points to has
+    // to stay around), but we'll allow it under the assumption that you know what you're doing
+    // (and have an appropriate keep_alive in place).  We return a numpy array pointing directly at
+    // the ref's data (The numpy array ends up read-only if the ref was to a const matrix type.)
+    // Note that this means you need to ensure you don't destroy the object in some other way (e.g.
+    // with an appropriate keep_alive, or with a reference to a statically allocated matrix).
+    static handle cast(const MapType &src, return_value_policy policy, handle parent) {
+        switch (policy) {
+            case return_value_policy::copy:
+                return eigen_array_cast<props>(src);
+            case return_value_policy::reference_internal:
+                return eigen_array_cast<props>(src, parent, is_eigen_mutable_map<MapType>::value);
+            case return_value_policy::reference:
+            case return_value_policy::automatic:
+            case return_value_policy::automatic_reference:
+                return eigen_array_cast<props>(src, none(), is_eigen_mutable_map<MapType>::value);
+            default:
+                // move, take_ownership don't make any sense for a ref/map:
+                pybind11_fail("Invalid return_value_policy for Eigen Map/Ref/Block type");
+        }
+    }
+
+    static constexpr auto name = props::descriptor;
+
+    // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return
+    // types but not bound arguments).  We still provide them (with an explicitly delete) so that
+    // you end up here if you try anyway.
+    bool load(handle, bool) = delete;
+    operator MapType() = delete;
+    template <typename>
+    using cast_op_type = MapType;
+};
+
+// We can return any map-like object (but can only load Refs, specialized next):
+template <typename Type>
+struct type_caster<Type, enable_if_t<is_eigen_dense_map<Type>::value>> : eigen_map_caster<Type> {};
+
+// Loader for Ref<...> arguments.  See the documentation for info on how to make this work without
+// copying (it requires some extra effort in many cases).
+template <typename PlainObjectType, typename StrideType>
+struct type_caster<
+    Eigen::Ref<PlainObjectType, 0, StrideType>,
+    enable_if_t<is_eigen_dense_map<Eigen::Ref<PlainObjectType, 0, StrideType>>::value>>
+    : public eigen_map_caster<Eigen::Ref<PlainObjectType, 0, StrideType>> {
+private:
+    using Type = Eigen::Ref<PlainObjectType, 0, StrideType>;
+    using props = EigenProps<Type>;
+    using Scalar = typename props::Scalar;
+    static_assert(!std::is_pointer<Scalar>::value,
+                  PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED);
+    using MapType = Eigen::Map<PlainObjectType, 0, StrideType>;
+    using Array
+        = array_t<Scalar,
+                  array::forcecast
+                      | ((props::row_major ? props::inner_stride : props::outer_stride) == 1
+                             ? array::c_style
+                         : (props::row_major ? props::outer_stride : props::inner_stride) == 1
+                             ? array::f_style
+                             : 0)>;
+    static constexpr bool need_writeable = is_eigen_mutable_map<Type>::value;
+    // Delay construction (these have no default constructor)
+    std::unique_ptr<MapType> map;
+    std::unique_ptr<Type> ref;
+    // Our array.  When possible, this is just a numpy array pointing to the source data, but
+    // sometimes we can't avoid copying (e.g. input is not a numpy array at all, has an
+    // incompatible layout, or is an array of a type that needs to be converted).  Using a numpy
+    // temporary (rather than an Eigen temporary) saves an extra copy when we need both type
+    // conversion and storage order conversion.  (Note that we refuse to use this temporary copy
+    // when loading an argument for a Ref<M> with M non-const, i.e. a read-write reference).
+    Array copy_or_ref;
+
+public:
+    bool load(handle src, bool convert) {
+        // First check whether what we have is already an array of the right type.  If not, we
+        // can't avoid a copy (because the copy is also going to do type conversion).
+        bool need_copy = !isinstance<Array>(src);
+
+        EigenConformable<props::row_major> fits;
+        if (!need_copy) {
+            // We don't need a converting copy, but we also need to check whether the strides are
+            // compatible with the Ref's stride requirements
+            auto aref = reinterpret_borrow<Array>(src);
+
+            if (aref && (!need_writeable || aref.writeable())) {
+                fits = props::conformable(aref);
+                if (!fits) {
+                    return false; // Incompatible dimensions
+                }
+                if (!fits.template stride_compatible<props>()) {
+                    need_copy = true;
+                } else {
+                    copy_or_ref = std::move(aref);
+                }
+            } else {
+                need_copy = true;
+            }
+        }
+
+        if (need_copy) {
+            // We need to copy: If we need a mutable reference, or we're not supposed to convert
+            // (either because we're in the no-convert overload pass, or because we're explicitly
+            // instructed not to copy (via `py::arg().noconvert()`) we have to fail loading.
+            if (!convert || need_writeable) {
+                return false;
+            }
+
+            Array copy = Array::ensure(src);
+            if (!copy) {
+                return false;
+            }
+            fits = props::conformable(copy);
+            if (!fits || !fits.template stride_compatible<props>()) {
+                return false;
+            }
+            copy_or_ref = std::move(copy);
+            loader_life_support::add_patient(copy_or_ref);
+        }
+
+        ref.reset();
+        map.reset(new MapType(data(copy_or_ref),
+                              fits.rows,
+                              fits.cols,
+                              make_stride(fits.stride.outer(), fits.stride.inner())));
+        ref.reset(new Type(*map));
+
+        return true;
+    }
+
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator Type *() { return ref.get(); }
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    operator Type &() { return *ref; }
+    template <typename _T>
+    using cast_op_type = pybind11::detail::cast_op_type<_T>;
+
+private:
+    template <typename T = Type, enable_if_t<is_eigen_mutable_map<T>::value, int> = 0>
+    Scalar *data(Array &a) {
+        return a.mutable_data();
+    }
+
+    template <typename T = Type, enable_if_t<!is_eigen_mutable_map<T>::value, int> = 0>
+    const Scalar *data(Array &a) {
+        return a.data();
+    }
+
+    // Attempt to figure out a constructor of `Stride` that will work.
+    // If both strides are fixed, use a default constructor:
+    template <typename S>
+    using stride_ctor_default = bool_constant<S::InnerStrideAtCompileTime != Eigen::Dynamic
+                                              && S::OuterStrideAtCompileTime != Eigen::Dynamic
+                                              && std::is_default_constructible<S>::value>;
+    // Otherwise, if there is a two-index constructor, assume it is (outer,inner) like
+    // Eigen::Stride, and use it:
+    template <typename S>
+    using stride_ctor_dual
+        = bool_constant<!stride_ctor_default<S>::value
+                        && std::is_constructible<S, EigenIndex, EigenIndex>::value>;
+    // Otherwise, if there is a one-index constructor, and just one of the strides is dynamic, use
+    // it (passing whichever stride is dynamic).
+    template <typename S>
+    using stride_ctor_outer
+        = bool_constant<!any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value
+                        && S::OuterStrideAtCompileTime == Eigen::Dynamic
+                        && S::InnerStrideAtCompileTime != Eigen::Dynamic
+                        && std::is_constructible<S, EigenIndex>::value>;
+    template <typename S>
+    using stride_ctor_inner
+        = bool_constant<!any_of<stride_ctor_default<S>, stride_ctor_dual<S>>::value
+                        && S::InnerStrideAtCompileTime == Eigen::Dynamic
+                        && S::OuterStrideAtCompileTime != Eigen::Dynamic
+                        && std::is_constructible<S, EigenIndex>::value>;
+
+    template <typename S = StrideType, enable_if_t<stride_ctor_default<S>::value, int> = 0>
+    static S make_stride(EigenIndex, EigenIndex) {
+        return S();
+    }
+    template <typename S = StrideType, enable_if_t<stride_ctor_dual<S>::value, int> = 0>
+    static S make_stride(EigenIndex outer, EigenIndex inner) {
+        return S(outer, inner);
+    }
+    template <typename S = StrideType, enable_if_t<stride_ctor_outer<S>::value, int> = 0>
+    static S make_stride(EigenIndex outer, EigenIndex) {
+        return S(outer);
+    }
+    template <typename S = StrideType, enable_if_t<stride_ctor_inner<S>::value, int> = 0>
+    static S make_stride(EigenIndex, EigenIndex inner) {
+        return S(inner);
+    }
+};
+
+// type_caster for special matrix types (e.g. DiagonalMatrix), which are EigenBase, but not
+// EigenDense (i.e. they don't have a data(), at least not with the usual matrix layout).
+// load() is not supported, but we can cast them into the python domain by first copying to a
+// regular Eigen::Matrix, then casting that.
+template <typename Type>
+struct type_caster<Type, enable_if_t<is_eigen_other<Type>::value>> {
+    static_assert(!std::is_pointer<typename Type::Scalar>::value,
+                  PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED);
+
+protected:
+    using Matrix
+        = Eigen::Matrix<typename Type::Scalar, Type::RowsAtCompileTime, Type::ColsAtCompileTime>;
+    using props = EigenProps<Matrix>;
+
+public:
+    static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
+        handle h = eigen_encapsulate<props>(new Matrix(src));
+        return h;
+    }
+    static handle cast(const Type *src, return_value_policy policy, handle parent) {
+        return cast(*src, policy, parent);
+    }
+
+    static constexpr auto name = props::descriptor;
+
+    // Explicitly delete these: support python -> C++ conversion on these (i.e. these can be return
+    // types but not bound arguments).  We still provide them (with an explicitly delete) so that
+    // you end up here if you try anyway.
+    bool load(handle, bool) = delete;
+    operator Type() = delete;
+    template <typename>
+    using cast_op_type = Type;
+};
+
+template <typename Type>
+struct type_caster<Type, enable_if_t<is_eigen_sparse<Type>::value>> {
+    using Scalar = typename Type::Scalar;
+    static_assert(!std::is_pointer<Scalar>::value,
+                  PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED);
+    using StorageIndex = remove_reference_t<decltype(*std::declval<Type>().outerIndexPtr())>;
+    using Index = typename Type::Index;
+    static constexpr bool rowMajor = Type::IsRowMajor;
+
+    bool load(handle src, bool) {
+        if (!src) {
+            return false;
+        }
+
+        auto obj = reinterpret_borrow<object>(src);
+        object sparse_module = module_::import("scipy.sparse");
+        object matrix_type = sparse_module.attr(rowMajor ? "csr_matrix" : "csc_matrix");
+
+        if (!type::handle_of(obj).is(matrix_type)) {
+            try {
+                obj = matrix_type(obj);
+            } catch (const error_already_set &) {
+                return false;
+            }
+        }
+
+        auto values = array_t<Scalar>((object) obj.attr("data"));
+        auto innerIndices = array_t<StorageIndex>((object) obj.attr("indices"));
+        auto outerIndices = array_t<StorageIndex>((object) obj.attr("indptr"));
+        auto shape = pybind11::tuple((pybind11::object) obj.attr("shape"));
+        auto nnz = obj.attr("nnz").cast<Index>();
+
+        if (!values || !innerIndices || !outerIndices) {
+            return false;
+        }
+
+        value = EigenMapSparseMatrix<Scalar,
+                                     Type::Flags &(Eigen::RowMajor | Eigen::ColMajor),
+                                     StorageIndex>(shape[0].cast<Index>(),
+                                                   shape[1].cast<Index>(),
+                                                   std::move(nnz),
+                                                   outerIndices.mutable_data(),
+                                                   innerIndices.mutable_data(),
+                                                   values.mutable_data());
+
+        return true;
+    }
+
+    static handle cast(const Type &src, return_value_policy /* policy */, handle /* parent */) {
+        const_cast<Type &>(src).makeCompressed();
+
+        object matrix_type
+            = module_::import("scipy.sparse").attr(rowMajor ? "csr_matrix" : "csc_matrix");
+
+        array data(src.nonZeros(), src.valuePtr());
+        array outerIndices((rowMajor ? src.rows() : src.cols()) + 1, src.outerIndexPtr());
+        array innerIndices(src.nonZeros(), src.innerIndexPtr());
+
+        return matrix_type(pybind11::make_tuple(
+                               std::move(data), std::move(innerIndices), std::move(outerIndices)),
+                           pybind11::make_tuple(src.rows(), src.cols()))
+            .release();
+    }
+
+    PYBIND11_TYPE_CASTER(Type,
+                         const_name<(Type::IsRowMajor) != 0>("scipy.sparse.csr_matrix[",
+                                                             "scipy.sparse.csc_matrix[")
+                             + npy_format_descriptor<Scalar>::name + const_name("]"));
+};
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/tensor.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/tensor.h
new file mode 100644
index 0000000000000000000000000000000000000000..0a9d7c2522a2cf90c8f9bfdcf9b993e08ee20cf0
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/eigen/tensor.h
@@ -0,0 +1,515 @@
+/*
+    pybind11/eigen/tensor.h: Transparent conversion for Eigen tensors
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include <pybind11/numpy.h>
+
+#include "common.h"
+
+#if defined(__GNUC__) && !defined(__clang__) && !defined(__INTEL_COMPILER)
+static_assert(__GNUC__ > 5, "Eigen Tensor support in pybind11 requires GCC > 5.0");
+#endif
+
+// Disable warnings for Eigen
+PYBIND11_WARNING_PUSH
+PYBIND11_WARNING_DISABLE_MSVC(4554)
+PYBIND11_WARNING_DISABLE_MSVC(4127)
+#if defined(__MINGW32__)
+PYBIND11_WARNING_DISABLE_GCC("-Wmaybe-uninitialized")
+#endif
+
+#include <unsupported/Eigen/CXX11/Tensor>
+
+PYBIND11_WARNING_POP
+
+static_assert(EIGEN_VERSION_AT_LEAST(3, 3, 0),
+              "Eigen Tensor support in pybind11 requires Eigen >= 3.3.0");
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_WARNING_DISABLE_MSVC(4127)
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+inline bool is_tensor_aligned(const void *data) {
+    return (reinterpret_cast<std::size_t>(data) % EIGEN_DEFAULT_ALIGN_BYTES) == 0;
+}
+
+template <typename T>
+constexpr int compute_array_flag_from_tensor() {
+    static_assert((static_cast<int>(T::Layout) == static_cast<int>(Eigen::RowMajor))
+                      || (static_cast<int>(T::Layout) == static_cast<int>(Eigen::ColMajor)),
+                  "Layout must be row or column major");
+    return (static_cast<int>(T::Layout) == static_cast<int>(Eigen::RowMajor)) ? array::c_style
+                                                                              : array::f_style;
+}
+
+template <typename T>
+struct eigen_tensor_helper {};
+
+template <typename Scalar_, int NumIndices_, int Options_, typename IndexType>
+struct eigen_tensor_helper<Eigen::Tensor<Scalar_, NumIndices_, Options_, IndexType>> {
+    using Type = Eigen::Tensor<Scalar_, NumIndices_, Options_, IndexType>;
+    using ValidType = void;
+
+    static Eigen::DSizes<typename Type::Index, Type::NumIndices> get_shape(const Type &f) {
+        return f.dimensions();
+    }
+
+    static constexpr bool
+    is_correct_shape(const Eigen::DSizes<typename Type::Index, Type::NumIndices> & /*shape*/) {
+        return true;
+    }
+
+    template <typename T>
+    struct helper {};
+
+    template <size_t... Is>
+    struct helper<index_sequence<Is...>> {
+        static constexpr auto value = ::pybind11::detail::concat(const_name(((void) Is, "?"))...);
+    };
+
+    static constexpr auto dimensions_descriptor
+        = helper<decltype(make_index_sequence<Type::NumIndices>())>::value;
+
+    template <typename... Args>
+    static Type *alloc(Args &&...args) {
+        return new Type(std::forward<Args>(args)...);
+    }
+
+    static void free(Type *tensor) { delete tensor; }
+};
+
+template <typename Scalar_, typename std::ptrdiff_t... Indices, int Options_, typename IndexType>
+struct eigen_tensor_helper<
+    Eigen::TensorFixedSize<Scalar_, Eigen::Sizes<Indices...>, Options_, IndexType>> {
+    using Type = Eigen::TensorFixedSize<Scalar_, Eigen::Sizes<Indices...>, Options_, IndexType>;
+    using ValidType = void;
+
+    static constexpr Eigen::DSizes<typename Type::Index, Type::NumIndices>
+    get_shape(const Type & /*f*/) {
+        return get_shape();
+    }
+
+    static constexpr Eigen::DSizes<typename Type::Index, Type::NumIndices> get_shape() {
+        return Eigen::DSizes<typename Type::Index, Type::NumIndices>(Indices...);
+    }
+
+    static bool
+    is_correct_shape(const Eigen::DSizes<typename Type::Index, Type::NumIndices> &shape) {
+        return get_shape() == shape;
+    }
+
+    static constexpr auto dimensions_descriptor
+        = ::pybind11::detail::concat(const_name<Indices>()...);
+
+    template <typename... Args>
+    static Type *alloc(Args &&...args) {
+        Eigen::aligned_allocator<Type> allocator;
+        return ::new (allocator.allocate(1)) Type(std::forward<Args>(args)...);
+    }
+
+    static void free(Type *tensor) {
+        Eigen::aligned_allocator<Type> allocator;
+        tensor->~Type();
+        allocator.deallocate(tensor, 1);
+    }
+};
+
+template <typename Type, bool ShowDetails, bool NeedsWriteable = false>
+struct get_tensor_descriptor {
+    static constexpr auto details
+        = const_name<NeedsWriteable>(", flags.writeable", "")
+          + const_name<static_cast<int>(Type::Layout) == static_cast<int>(Eigen::RowMajor)>(
+              ", flags.c_contiguous", ", flags.f_contiguous");
+    static constexpr auto value
+        = const_name("numpy.ndarray[") + npy_format_descriptor<typename Type::Scalar>::name
+          + const_name("[") + eigen_tensor_helper<remove_cv_t<Type>>::dimensions_descriptor
+          + const_name("]") + const_name<ShowDetails>(details, const_name("")) + const_name("]");
+};
+
+// When EIGEN_AVOID_STL_ARRAY is defined, Eigen::DSizes<T, 0> does not have the begin() member
+// function. Falling back to a simple loop works around this issue.
+//
+// We need to disable the type-limits warning for the inner loop when size = 0.
+
+PYBIND11_WARNING_PUSH
+PYBIND11_WARNING_DISABLE_GCC("-Wtype-limits")
+
+template <typename T, int size>
+std::vector<T> convert_dsizes_to_vector(const Eigen::DSizes<T, size> &arr) {
+    std::vector<T> result(size);
+
+    for (size_t i = 0; i < size; i++) {
+        result[i] = arr[i];
+    }
+
+    return result;
+}
+
+template <typename T, int size>
+Eigen::DSizes<T, size> get_shape_for_array(const array &arr) {
+    Eigen::DSizes<T, size> result;
+    const T *shape = arr.shape();
+    for (size_t i = 0; i < size; i++) {
+        result[i] = shape[i];
+    }
+
+    return result;
+}
+
+PYBIND11_WARNING_POP
+
+template <typename Type>
+struct type_caster<Type, typename eigen_tensor_helper<Type>::ValidType> {
+    static_assert(!std::is_pointer<typename Type::Scalar>::value,
+                  PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED);
+    using Helper = eigen_tensor_helper<Type>;
+    static constexpr auto temp_name = get_tensor_descriptor<Type, false>::value;
+    PYBIND11_TYPE_CASTER(Type, temp_name);
+
+    bool load(handle src, bool convert) {
+        if (!convert) {
+            if (!isinstance<array>(src)) {
+                return false;
+            }
+            array temp = array::ensure(src);
+            if (!temp) {
+                return false;
+            }
+
+            if (!temp.dtype().is(dtype::of<typename Type::Scalar>())) {
+                return false;
+            }
+        }
+
+        array_t<typename Type::Scalar, compute_array_flag_from_tensor<Type>()> arr(
+            reinterpret_borrow<object>(src));
+
+        if (arr.ndim() != Type::NumIndices) {
+            return false;
+        }
+        auto shape = get_shape_for_array<typename Type::Index, Type::NumIndices>(arr);
+
+        if (!Helper::is_correct_shape(shape)) {
+            return false;
+        }
+
+#if EIGEN_VERSION_AT_LEAST(3, 4, 0)
+        auto data_pointer = arr.data();
+#else
+        // Handle Eigen bug
+        auto data_pointer = const_cast<typename Type::Scalar *>(arr.data());
+#endif
+
+        if (is_tensor_aligned(arr.data())) {
+            value = Eigen::TensorMap<const Type, Eigen::Aligned>(data_pointer, shape);
+        } else {
+            value = Eigen::TensorMap<const Type>(data_pointer, shape);
+        }
+
+        return true;
+    }
+
+    static handle cast(Type &&src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::reference
+            || policy == return_value_policy::reference_internal) {
+            pybind11_fail("Cannot use a reference return value policy for an rvalue");
+        }
+        return cast_impl(&src, return_value_policy::move, parent);
+    }
+
+    static handle cast(const Type &&src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::reference
+            || policy == return_value_policy::reference_internal) {
+            pybind11_fail("Cannot use a reference return value policy for an rvalue");
+        }
+        return cast_impl(&src, return_value_policy::move, parent);
+    }
+
+    static handle cast(Type &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic
+            || policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::copy;
+        }
+        return cast_impl(&src, policy, parent);
+    }
+
+    static handle cast(const Type &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic
+            || policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::copy;
+        }
+        return cast(&src, policy, parent);
+    }
+
+    static handle cast(Type *src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic) {
+            policy = return_value_policy::take_ownership;
+        } else if (policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::reference;
+        }
+        return cast_impl(src, policy, parent);
+    }
+
+    static handle cast(const Type *src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic) {
+            policy = return_value_policy::take_ownership;
+        } else if (policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::reference;
+        }
+        return cast_impl(src, policy, parent);
+    }
+
+    template <typename C>
+    static handle cast_impl(C *src, return_value_policy policy, handle parent) {
+        object parent_object;
+        bool writeable = false;
+        switch (policy) {
+            case return_value_policy::move:
+                if (std::is_const<C>::value) {
+                    pybind11_fail("Cannot move from a constant reference");
+                }
+
+                src = Helper::alloc(std::move(*src));
+
+                parent_object
+                    = capsule(src, [](void *ptr) { Helper::free(reinterpret_cast<Type *>(ptr)); });
+                writeable = true;
+                break;
+
+            case return_value_policy::take_ownership:
+                if (std::is_const<C>::value) {
+                    // This cast is ugly, and might be UB in some cases, but we don't have an
+                    // alternative here as we must free that memory
+                    Helper::free(const_cast<Type *>(src));
+                    pybind11_fail("Cannot take ownership of a const reference");
+                }
+
+                parent_object
+                    = capsule(src, [](void *ptr) { Helper::free(reinterpret_cast<Type *>(ptr)); });
+                writeable = true;
+                break;
+
+            case return_value_policy::copy:
+                writeable = true;
+                break;
+
+            case return_value_policy::reference:
+                parent_object = none();
+                writeable = !std::is_const<C>::value;
+                break;
+
+            case return_value_policy::reference_internal:
+                // Default should do the right thing
+                if (!parent) {
+                    pybind11_fail("Cannot use reference internal when there is no parent");
+                }
+                parent_object = reinterpret_borrow<object>(parent);
+                writeable = !std::is_const<C>::value;
+                break;
+
+            default:
+                pybind11_fail("pybind11 bug in eigen.h, please file a bug report");
+        }
+
+        auto result = array_t<typename Type::Scalar, compute_array_flag_from_tensor<Type>()>(
+            convert_dsizes_to_vector(Helper::get_shape(*src)), src->data(), parent_object);
+
+        if (!writeable) {
+            array_proxy(result.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_;
+        }
+
+        return result.release();
+    }
+};
+
+template <typename StoragePointerType,
+          bool needs_writeable,
+          enable_if_t<!needs_writeable, bool> = true>
+StoragePointerType get_array_data_for_type(array &arr) {
+#if EIGEN_VERSION_AT_LEAST(3, 4, 0)
+    return reinterpret_cast<StoragePointerType>(arr.data());
+#else
+    // Handle Eigen bug
+    return reinterpret_cast<StoragePointerType>(const_cast<void *>(arr.data()));
+#endif
+}
+
+template <typename StoragePointerType,
+          bool needs_writeable,
+          enable_if_t<needs_writeable, bool> = true>
+StoragePointerType get_array_data_for_type(array &arr) {
+    return reinterpret_cast<StoragePointerType>(arr.mutable_data());
+}
+
+template <typename T, typename = void>
+struct get_storage_pointer_type;
+
+template <typename MapType>
+struct get_storage_pointer_type<MapType, void_t<typename MapType::StoragePointerType>> {
+    using SPT = typename MapType::StoragePointerType;
+};
+
+template <typename MapType>
+struct get_storage_pointer_type<MapType, void_t<typename MapType::PointerArgType>> {
+    using SPT = typename MapType::PointerArgType;
+};
+
+template <typename Type, int Options>
+struct type_caster<Eigen::TensorMap<Type, Options>,
+                   typename eigen_tensor_helper<remove_cv_t<Type>>::ValidType> {
+    static_assert(!std::is_pointer<typename Type::Scalar>::value,
+                  PYBIND11_EIGEN_MESSAGE_POINTER_TYPES_ARE_NOT_SUPPORTED);
+    using MapType = Eigen::TensorMap<Type, Options>;
+    using Helper = eigen_tensor_helper<remove_cv_t<Type>>;
+
+    bool load(handle src, bool /*convert*/) {
+        // Note that we have a lot more checks here as we want to make sure to avoid copies
+        if (!isinstance<array>(src)) {
+            return false;
+        }
+        auto arr = reinterpret_borrow<array>(src);
+        if ((arr.flags() & compute_array_flag_from_tensor<Type>()) == 0) {
+            return false;
+        }
+
+        if (!arr.dtype().is(dtype::of<typename Type::Scalar>())) {
+            return false;
+        }
+
+        if (arr.ndim() != Type::NumIndices) {
+            return false;
+        }
+
+        constexpr bool is_aligned = (Options & Eigen::Aligned) != 0;
+
+        if (is_aligned && !is_tensor_aligned(arr.data())) {
+            return false;
+        }
+
+        auto shape = get_shape_for_array<typename Type::Index, Type::NumIndices>(arr);
+
+        if (!Helper::is_correct_shape(shape)) {
+            return false;
+        }
+
+        if (needs_writeable && !arr.writeable()) {
+            return false;
+        }
+
+        auto result = get_array_data_for_type<typename get_storage_pointer_type<MapType>::SPT,
+                                              needs_writeable>(arr);
+
+        value.reset(new MapType(std::move(result), std::move(shape)));
+
+        return true;
+    }
+
+    static handle cast(MapType &&src, return_value_policy policy, handle parent) {
+        return cast_impl(&src, policy, parent);
+    }
+
+    static handle cast(const MapType &&src, return_value_policy policy, handle parent) {
+        return cast_impl(&src, policy, parent);
+    }
+
+    static handle cast(MapType &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic
+            || policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::copy;
+        }
+        return cast_impl(&src, policy, parent);
+    }
+
+    static handle cast(const MapType &src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic
+            || policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::copy;
+        }
+        return cast(&src, policy, parent);
+    }
+
+    static handle cast(MapType *src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic) {
+            policy = return_value_policy::take_ownership;
+        } else if (policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::reference;
+        }
+        return cast_impl(src, policy, parent);
+    }
+
+    static handle cast(const MapType *src, return_value_policy policy, handle parent) {
+        if (policy == return_value_policy::automatic) {
+            policy = return_value_policy::take_ownership;
+        } else if (policy == return_value_policy::automatic_reference) {
+            policy = return_value_policy::reference;
+        }
+        return cast_impl(src, policy, parent);
+    }
+
+    template <typename C>
+    static handle cast_impl(C *src, return_value_policy policy, handle parent) {
+        object parent_object;
+        constexpr bool writeable = !std::is_const<C>::value;
+        switch (policy) {
+            case return_value_policy::reference:
+                parent_object = none();
+                break;
+
+            case return_value_policy::reference_internal:
+                // Default should do the right thing
+                if (!parent) {
+                    pybind11_fail("Cannot use reference internal when there is no parent");
+                }
+                parent_object = reinterpret_borrow<object>(parent);
+                break;
+
+            default:
+                // move, take_ownership don't make any sense for a ref/map:
+                pybind11_fail("Invalid return_value_policy for Eigen Map type, must be either "
+                              "reference or reference_internal");
+        }
+
+        auto result = array_t<typename Type::Scalar, compute_array_flag_from_tensor<Type>()>(
+            convert_dsizes_to_vector(Helper::get_shape(*src)),
+            src->data(),
+            std::move(parent_object));
+
+        if (!writeable) {
+            array_proxy(result.ptr())->flags &= ~detail::npy_api::NPY_ARRAY_WRITEABLE_;
+        }
+
+        return result.release();
+    }
+
+#if EIGEN_VERSION_AT_LEAST(3, 4, 0)
+
+    static constexpr bool needs_writeable = !std::is_const<typename std::remove_pointer<
+        typename get_storage_pointer_type<MapType>::SPT>::type>::value;
+#else
+    // Handle Eigen bug
+    static constexpr bool needs_writeable = !std::is_const<Type>::value;
+#endif
+
+protected:
+    // TODO: Move to std::optional once std::optional has more support
+    std::unique_ptr<MapType> value;
+
+public:
+    static constexpr auto name = get_tensor_descriptor<Type, true, needs_writeable>::value;
+    explicit operator MapType *() { return value.get(); }
+    explicit operator MapType &() { return *value; }
+    explicit operator MapType &&() && { return std::move(*value); }
+
+    template <typename T_>
+    using cast_op_type = ::pybind11::detail::movable_cast_op_type<T_>;
+};
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/numpy.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/numpy.h
new file mode 100644
index 0000000000000000000000000000000000000000..09894cf74f4aef2b91f657f84ebfb6af3e7e8d0f
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/numpy.h
@@ -0,0 +1,2139 @@
+/*
+    pybind11/numpy.h: Basic NumPy support, vectorize() wrapper
+
+    Copyright (c) 2016 Wenzel Jakob <wenzel.jakob@epfl.ch>
+
+    All rights reserved. Use of this source code is governed by a
+    BSD-style license that can be found in the LICENSE file.
+*/
+
+#pragma once
+
+#include "pybind11.h"
+#include "detail/common.h"
+#include "complex.h"
+#include "gil_safe_call_once.h"
+#include "pytypes.h"
+
+#include <algorithm>
+#include <array>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <functional>
+#include <numeric>
+#include <sstream>
+#include <string>
+#include <type_traits>
+#include <typeindex>
+#include <utility>
+#include <vector>
+
+#if defined(PYBIND11_NUMPY_1_ONLY) && !defined(PYBIND11_INTERNAL_NUMPY_1_ONLY_DETECTED)
+#    error PYBIND11_NUMPY_1_ONLY must be defined before any pybind11 header is included.
+#endif
+
+/* This will be true on all flat address space platforms and allows us to reduce the
+   whole npy_intp / ssize_t / Py_intptr_t business down to just ssize_t for all size
+   and dimension types (e.g. shape, strides, indexing), instead of inflicting this
+   upon the library user.
+   Note that NumPy 2 now uses ssize_t for `npy_intp` to simplify this. */
+static_assert(sizeof(::pybind11::ssize_t) == sizeof(Py_intptr_t), "ssize_t != Py_intptr_t");
+static_assert(std::is_signed<Py_intptr_t>::value, "Py_intptr_t must be signed");
+// We now can reinterpret_cast between py::ssize_t and Py_intptr_t (MSVC + PyPy cares)
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+
+PYBIND11_WARNING_DISABLE_MSVC(4127)
+
+class dtype; // Forward declaration
+class array; // Forward declaration
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+template <>
+struct handle_type_name<dtype> {
+    static constexpr auto name = const_name("numpy.dtype");
+};
+
+template <>
+struct handle_type_name<array> {
+    static constexpr auto name = const_name("numpy.ndarray");
+};
+
+template <typename type, typename SFINAE = void>
+struct npy_format_descriptor;
+
+/* NumPy 1 proxy (always includes legacy fields) */
+struct PyArrayDescr1_Proxy {
+    PyObject_HEAD
+    PyObject *typeobj;
+    char kind;
+    char type;
+    char byteorder;
+    char flags;
+    int type_num;
+    int elsize;
+    int alignment;
+    char *subarray;
+    PyObject *fields;
+    PyObject *names;
+};
+
+#ifndef PYBIND11_NUMPY_1_ONLY
+struct PyArrayDescr_Proxy {
+    PyObject_HEAD
+    PyObject *typeobj;
+    char kind;
+    char type;
+    char byteorder;
+    char _former_flags;
+    int type_num;
+    /* Additional fields are NumPy version specific. */
+};
+#else
+/* NumPy 1.x only, we can expose all fields */
+using PyArrayDescr_Proxy = PyArrayDescr1_Proxy;
+#endif
+
+/* NumPy 2 proxy, including legacy fields */
+struct PyArrayDescr2_Proxy {
+    PyObject_HEAD
+    PyObject *typeobj;
+    char kind;
+    char type;
+    char byteorder;
+    char _former_flags;
+    int type_num;
+    std::uint64_t flags;
+    ssize_t elsize;
+    ssize_t alignment;
+    PyObject *metadata;
+    Py_hash_t hash;
+    void *reserved_null[2];
+    /* The following fields only exist if 0 <= type_num < 2056 */
+    char *subarray;
+    PyObject *fields;
+    PyObject *names;
+};
+
+struct PyArray_Proxy {
+    PyObject_HEAD
+    char *data;
+    int nd;
+    ssize_t *dimensions;
+    ssize_t *strides;
+    PyObject *base;
+    PyObject *descr;
+    int flags;
+};
+
+struct PyVoidScalarObject_Proxy {
+    PyObject_VAR_HEAD char *obval;
+    PyArrayDescr_Proxy *descr;
+    int flags;
+    PyObject *base;
+};
+
+struct numpy_type_info {
+    PyObject *dtype_ptr;
+    std::string format_str;
+};
+
+struct numpy_internals {
+    std::unordered_map<std::type_index, numpy_type_info> registered_dtypes;
+
+    numpy_type_info *get_type_info(const std::type_info &tinfo, bool throw_if_missing = true) {
+        auto it = registered_dtypes.find(std::type_index(tinfo));
+        if (it != registered_dtypes.end()) {
+            return &(it->second);
+        }
+        if (throw_if_missing) {
+            pybind11_fail(std::string("NumPy type info missing for ") + tinfo.name());
+        }
+        return nullptr;
+    }
+
+    template <typename T>
+    numpy_type_info *get_type_info(bool throw_if_missing = true) {
+        return get_type_info(typeid(typename std::remove_cv<T>::type), throw_if_missing);
+    }
+};
+
+PYBIND11_NOINLINE void load_numpy_internals(numpy_internals *&ptr) {
+    ptr = &get_or_create_shared_data<numpy_internals>("_numpy_internals");
+}
+
+inline numpy_internals &get_numpy_internals() {
+    static numpy_internals *ptr = nullptr;
+    if (!ptr) {
+        load_numpy_internals(ptr);
+    }
+    return *ptr;
+}
+
+PYBIND11_NOINLINE module_ import_numpy_core_submodule(const char *submodule_name) {
+    module_ numpy = module_::import("numpy");
+    str version_string = numpy.attr("__version__");
+
+    module_ numpy_lib = module_::import("numpy.lib");
+    object numpy_version = numpy_lib.attr("NumpyVersion")(version_string);
+    int major_version = numpy_version.attr("major").cast<int>();
+
+#ifdef PYBIND11_NUMPY_1_ONLY
+    if (major_version >= 2) {
+        throw std::runtime_error(
+            "This extension was built with PYBIND11_NUMPY_1_ONLY defined, "
+            "but NumPy 2 is used in this process. For NumPy2 compatibility, "
+            "this extension needs to be rebuilt without the PYBIND11_NUMPY_1_ONLY define.");
+    }
+#endif
+    /* `numpy.core` was renamed to `numpy._core` in NumPy 2.0 as it officially
+        became a private module. */
+    std::string numpy_core_path = major_version >= 2 ? "numpy._core" : "numpy.core";
+    return module_::import((numpy_core_path + "." + submodule_name).c_str());
+}
+
+template <typename T>
+struct same_size {
+    template <typename U>
+    using as = bool_constant<sizeof(T) == sizeof(U)>;
+};
+
+template <typename Concrete>
+constexpr int platform_lookup() {
+    return -1;
+}
+
+// Lookup a type according to its size, and return a value corresponding to the NumPy typenum.
+template <typename Concrete, typename T, typename... Ts, typename... Ints>
+constexpr int platform_lookup(int I, Ints... Is) {
+    return sizeof(Concrete) == sizeof(T) ? I : platform_lookup<Concrete, Ts...>(Is...);
+}
+
+struct npy_api {
+    enum constants {
+        NPY_ARRAY_C_CONTIGUOUS_ = 0x0001,
+        NPY_ARRAY_F_CONTIGUOUS_ = 0x0002,
+        NPY_ARRAY_OWNDATA_ = 0x0004,
+        NPY_ARRAY_FORCECAST_ = 0x0010,
+        NPY_ARRAY_ENSUREARRAY_ = 0x0040,
+        NPY_ARRAY_ALIGNED_ = 0x0100,
+        NPY_ARRAY_WRITEABLE_ = 0x0400,
+        NPY_BOOL_ = 0,
+        NPY_BYTE_,
+        NPY_UBYTE_,
+        NPY_SHORT_,
+        NPY_USHORT_,
+        NPY_INT_,
+        NPY_UINT_,
+        NPY_LONG_,
+        NPY_ULONG_,
+        NPY_LONGLONG_,
+        NPY_ULONGLONG_,
+        NPY_FLOAT_,
+        NPY_DOUBLE_,
+        NPY_LONGDOUBLE_,
+        NPY_CFLOAT_,
+        NPY_CDOUBLE_,
+        NPY_CLONGDOUBLE_,
+        NPY_OBJECT_ = 17,
+        NPY_STRING_,
+        NPY_UNICODE_,
+        NPY_VOID_,
+        // Platform-dependent normalization
+        NPY_INT8_ = NPY_BYTE_,
+        NPY_UINT8_ = NPY_UBYTE_,
+        NPY_INT16_ = NPY_SHORT_,
+        NPY_UINT16_ = NPY_USHORT_,
+        // `npy_common.h` defines the integer aliases. In order, it checks:
+        // NPY_BITSOF_LONG, NPY_BITSOF_LONGLONG, NPY_BITSOF_INT, NPY_BITSOF_SHORT, NPY_BITSOF_CHAR
+        // and assigns the alias to the first matching size, so we should check in this order.
+        NPY_INT32_
+        = platform_lookup<std::int32_t, long, int, short>(NPY_LONG_, NPY_INT_, NPY_SHORT_),
+        NPY_UINT32_ = platform_lookup<std::uint32_t, unsigned long, unsigned int, unsigned short>(
+            NPY_ULONG_, NPY_UINT_, NPY_USHORT_),
+        NPY_INT64_
+        = platform_lookup<std::int64_t, long, long long, int>(NPY_LONG_, NPY_LONGLONG_, NPY_INT_),
+        NPY_UINT64_
+        = platform_lookup<std::uint64_t, unsigned long, unsigned long long, unsigned int>(
+            NPY_ULONG_, NPY_ULONGLONG_, NPY_UINT_),
+    };
+
+    unsigned int PyArray_RUNTIME_VERSION_;
+
+    struct PyArray_Dims {
+        Py_intptr_t *ptr;
+        int len;
+    };
+
+    static npy_api &get() {
+        PYBIND11_CONSTINIT static gil_safe_call_once_and_store<npy_api> storage;
+        return storage.call_once_and_store_result(lookup).get_stored();
+    }
+
+    bool PyArray_Check_(PyObject *obj) const {
+        return PyObject_TypeCheck(obj, PyArray_Type_) != 0;
+    }
+    bool PyArrayDescr_Check_(PyObject *obj) const {
+        return PyObject_TypeCheck(obj, PyArrayDescr_Type_) != 0;
+    }
+
+    unsigned int (*PyArray_GetNDArrayCFeatureVersion_)();
+    PyObject *(*PyArray_DescrFromType_)(int);
+    PyObject *(*PyArray_NewFromDescr_)(PyTypeObject *,
+                                       PyObject *,
+                                       int,
+                                       Py_intptr_t const *,
+                                       Py_intptr_t const *,
+                                       void *,
+                                       int,
+                                       PyObject *);
+    // Unused. Not removed because that affects ABI of the class.
+    PyObject *(*PyArray_DescrNewFromType_)(int);
+    int (*PyArray_CopyInto_)(PyObject *, PyObject *);
+    PyObject *(*PyArray_NewCopy_)(PyObject *, int);
+    PyTypeObject *PyArray_Type_;
+    PyTypeObject *PyVoidArrType_Type_;
+    PyTypeObject *PyArrayDescr_Type_;
+    PyObject *(*PyArray_DescrFromScalar_)(PyObject *);
+    PyObject *(*PyArray_FromAny_)(PyObject *, PyObject *, int, int, int, PyObject *);
+    int (*PyArray_DescrConverter_)(PyObject *, PyObject **);
+    bool (*PyArray_EquivTypes_)(PyObject *, PyObject *);
+#ifdef PYBIND11_NUMPY_1_ONLY
+    int (*PyArray_GetArrayParamsFromObject_)(PyObject *,
+                                             PyObject *,
+                                             unsigned char,
+                                             PyObject **,
+                                             int *,
+                                             Py_intptr_t *,
+                                             PyObject **,
+                                             PyObject *);
+#endif
+    PyObject *(*PyArray_Squeeze_)(PyObject *);
+    // Unused. Not removed because that affects ABI of the class.
+    int (*PyArray_SetBaseObject_)(PyObject *, PyObject *);
+    PyObject *(*PyArray_Resize_)(PyObject *, PyArray_Dims *, int, int);
+    PyObject *(*PyArray_Newshape_)(PyObject *, PyArray_Dims *, int);
+    PyObject *(*PyArray_View_)(PyObject *, PyObject *, PyObject *);
+
+private:
+    enum functions {
+        API_PyArray_GetNDArrayCFeatureVersion = 211,
+        API_PyArray_Type = 2,
+        API_PyArrayDescr_Type = 3,
+        API_PyVoidArrType_Type = 39,
+        API_PyArray_DescrFromType = 45,
+        API_PyArray_DescrFromScalar = 57,
+        API_PyArray_FromAny = 69,
+        API_PyArray_Resize = 80,
+        // CopyInto was slot 82 and 50 was effectively an alias. NumPy 2 removed 82.
+        API_PyArray_CopyInto = 50,
+        API_PyArray_NewCopy = 85,
+        API_PyArray_NewFromDescr = 94,
+        API_PyArray_DescrNewFromType = 96,
+        API_PyArray_Newshape = 135,
+        API_PyArray_Squeeze = 136,
+        API_PyArray_View = 137,
+        API_PyArray_DescrConverter = 174,
+        API_PyArray_EquivTypes = 182,
+#ifdef PYBIND11_NUMPY_1_ONLY
+        API_PyArray_GetArrayParamsFromObject = 278,
+#endif
+        API_PyArray_SetBaseObject = 282
+    };
+
+    static npy_api lookup() {
+        module_ m = detail::import_numpy_core_submodule("multiarray");
+        auto c = m.attr("_ARRAY_API");
+        void **api_ptr = (void **) PyCapsule_GetPointer(c.ptr(), nullptr);
+        if (api_ptr == nullptr) {
+            raise_from(PyExc_SystemError, "FAILURE obtaining numpy _ARRAY_API pointer.");
+            throw error_already_set();
+        }
+        npy_api api;
+#define DECL_NPY_API(Func) api.Func##_ = (decltype(api.Func##_)) api_ptr[API_##Func];
+        DECL_NPY_API(PyArray_GetNDArrayCFeatureVersion);
+        api.PyArray_RUNTIME_VERSION_ = api.PyArray_GetNDArrayCFeatureVersion_();
+        if (api.PyArray_RUNTIME_VERSION_ < 0x7) {
+            pybind11_fail("pybind11 numpy support requires numpy >= 1.7.0");
+        }
+        DECL_NPY_API(PyArray_Type);
+        DECL_NPY_API(PyVoidArrType_Type);
+        DECL_NPY_API(PyArrayDescr_Type);
+        DECL_NPY_API(PyArray_DescrFromType);
+        DECL_NPY_API(PyArray_DescrFromScalar);
+        DECL_NPY_API(PyArray_FromAny);
+        DECL_NPY_API(PyArray_Resize);
+        DECL_NPY_API(PyArray_CopyInto);
+        DECL_NPY_API(PyArray_NewCopy);
+        DECL_NPY_API(PyArray_NewFromDescr);
+        DECL_NPY_API(PyArray_DescrNewFromType);
+        DECL_NPY_API(PyArray_Newshape);
+        DECL_NPY_API(PyArray_Squeeze);
+        DECL_NPY_API(PyArray_View);
+        DECL_NPY_API(PyArray_DescrConverter);
+        DECL_NPY_API(PyArray_EquivTypes);
+#ifdef PYBIND11_NUMPY_1_ONLY
+        DECL_NPY_API(PyArray_GetArrayParamsFromObject);
+#endif
+        DECL_NPY_API(PyArray_SetBaseObject);
+
+#undef DECL_NPY_API
+        return api;
+    }
+};
+
+inline PyArray_Proxy *array_proxy(void *ptr) { return reinterpret_cast<PyArray_Proxy *>(ptr); }
+
+inline const PyArray_Proxy *array_proxy(const void *ptr) {
+    return reinterpret_cast<const PyArray_Proxy *>(ptr);
+}
+
+inline PyArrayDescr_Proxy *array_descriptor_proxy(PyObject *ptr) {
+    return reinterpret_cast<PyArrayDescr_Proxy *>(ptr);
+}
+
+inline const PyArrayDescr_Proxy *array_descriptor_proxy(const PyObject *ptr) {
+    return reinterpret_cast<const PyArrayDescr_Proxy *>(ptr);
+}
+
+inline const PyArrayDescr1_Proxy *array_descriptor1_proxy(const PyObject *ptr) {
+    return reinterpret_cast<const PyArrayDescr1_Proxy *>(ptr);
+}
+
+inline const PyArrayDescr2_Proxy *array_descriptor2_proxy(const PyObject *ptr) {
+    return reinterpret_cast<const PyArrayDescr2_Proxy *>(ptr);
+}
+
+inline bool check_flags(const void *ptr, int flag) {
+    return (flag == (array_proxy(ptr)->flags & flag));
+}
+
+template <typename T>
+struct is_std_array : std::false_type {};
+template <typename T, size_t N>
+struct is_std_array<std::array<T, N>> : std::true_type {};
+template <typename T>
+struct is_complex : std::false_type {};
+template <typename T>
+struct is_complex<std::complex<T>> : std::true_type {};
+
+template <typename T>
+struct array_info_scalar {
+    using type = T;
+    static constexpr bool is_array = false;
+    static constexpr bool is_empty = false;
+    static constexpr auto extents = const_name("");
+    static void append_extents(list & /* shape */) {}
+};
+// Computes underlying type and a comma-separated list of extents for array
+// types (any mix of std::array and built-in arrays). An array of char is
+// treated as scalar because it gets special handling.
+template <typename T>
+struct array_info : array_info_scalar<T> {};
+template <typename T, size_t N>
+struct array_info<std::array<T, N>> {
+    using type = typename array_info<T>::type;
+    static constexpr bool is_array = true;
+    static constexpr bool is_empty = (N == 0) || array_info<T>::is_empty;
+    static constexpr size_t extent = N;
+
+    // appends the extents to shape
+    static void append_extents(list &shape) {
+        shape.append(N);
+        array_info<T>::append_extents(shape);
+    }
+
+    static constexpr auto extents = const_name<array_info<T>::is_array>(
+        ::pybind11::detail::concat(const_name<N>(), array_info<T>::extents), const_name<N>());
+};
+// For numpy we have special handling for arrays of characters, so we don't include
+// the size in the array extents.
+template <size_t N>
+struct array_info<char[N]> : array_info_scalar<char[N]> {};
+template <size_t N>
+struct array_info<std::array<char, N>> : array_info_scalar<std::array<char, N>> {};
+template <typename T, size_t N>
+struct array_info<T[N]> : array_info<std::array<T, N>> {};
+template <typename T>
+using remove_all_extents_t = typename array_info<T>::type;
+
+template <typename T>
+using is_pod_struct
+    = all_of<std::is_standard_layout<T>, // since we're accessing directly in memory
+                                         // we need a standard layout type
+#if defined(__GLIBCXX__)                                                                          \
+    && (__GLIBCXX__ < 20150422 || __GLIBCXX__ == 20150426 || __GLIBCXX__ == 20150623              \
+        || __GLIBCXX__ == 20150626 || __GLIBCXX__ == 20160803)
+             // libstdc++ < 5 (including versions 4.8.5, 4.9.3 and 4.9.4 which were released after
+             // 5) don't implement is_trivially_copyable, so approximate it
+             std::is_trivially_destructible<T>,
+             satisfies_any_of<T, std::has_trivial_copy_constructor, std::has_trivial_copy_assign>,
+#else
+             std::is_trivially_copyable<T>,
+#endif
+             satisfies_none_of<T,
+                               std::is_reference,
+                               std::is_array,
+                               is_std_array,
+                               std::is_arithmetic,
+                               is_complex,
+                               std::is_enum>>;
+
+// Replacement for std::is_pod (deprecated in C++20)
+template <typename T>
+using is_pod = all_of<std::is_standard_layout<T>, std::is_trivial<T>>;
+
+template <ssize_t Dim = 0, typename Strides>
+ssize_t byte_offset_unsafe(const Strides &) {
+    return 0;
+}
+template <ssize_t Dim = 0, typename Strides, typename... Ix>
+ssize_t byte_offset_unsafe(const Strides &strides, ssize_t i, Ix... index) {
+    return i * strides[Dim] + byte_offset_unsafe<Dim + 1>(strides, index...);
+}
+
+/**
+ * Proxy class providing unsafe, unchecked const access to array data.  This is constructed through
+ * the `unchecked<T, N>()` method of `array` or the `unchecked<N>()` method of `array_t<T>`. `Dims`
+ * will be -1 for dimensions determined at runtime.
+ */
+template <typename T, ssize_t Dims>
+class unchecked_reference {
+protected:
+    static constexpr bool Dynamic = Dims < 0;
+    const unsigned char *data_;
+    // Storing the shape & strides in local variables (i.e. these arrays) allows the compiler to
+    // make large performance gains on big, nested loops, but requires compile-time dimensions
+    conditional_t<Dynamic, const ssize_t *, std::array<ssize_t, (size_t) Dims>> shape_, strides_;
+    const ssize_t dims_;
+
+    friend class pybind11::array;
+    // Constructor for compile-time dimensions:
+    template <bool Dyn = Dynamic>
+    unchecked_reference(const void *data,
+                        const ssize_t *shape,
+                        const ssize_t *strides,
+                        enable_if_t<!Dyn, ssize_t>)
+        : data_{reinterpret_cast<const unsigned char *>(data)}, dims_{Dims} {
+        for (size_t i = 0; i < (size_t) dims_; i++) {
+            shape_[i] = shape[i];
+            strides_[i] = strides[i];
+        }
+    }
+    // Constructor for runtime dimensions:
+    template <bool Dyn = Dynamic>
+    unchecked_reference(const void *data,
+                        const ssize_t *shape,
+                        const ssize_t *strides,
+                        enable_if_t<Dyn, ssize_t> dims)
+        : data_{reinterpret_cast<const unsigned char *>(data)}, shape_{shape}, strides_{strides},
+          dims_{dims} {}
+
+public:
+    /**
+     * Unchecked const reference access to data at the given indices.  For a compile-time known
+     * number of dimensions, this requires the correct number of arguments; for run-time
+     * dimensionality, this is not checked (and so is up to the caller to use safely).
+     */
+    template <typename... Ix>
+    const T &operator()(Ix... index) const {
+        static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
+                      "Invalid number of indices for unchecked array reference");
+        return *reinterpret_cast<const T *>(data_
+                                            + byte_offset_unsafe(strides_, ssize_t(index)...));
+    }
+    /**
+     * Unchecked const reference access to data; this operator only participates if the reference
+     * is to a 1-dimensional array.  When present, this is exactly equivalent to `obj(index)`.
+     */
+    template <ssize_t D = Dims, typename = enable_if_t<D == 1 || Dynamic>>
+    const T &operator[](ssize_t index) const {
+        return operator()(index);
+    }
+
+    /// Pointer access to the data at the given indices.
+    template <typename... Ix>
+    const T *data(Ix... ix) const {
+        return &operator()(ssize_t(ix)...);
+    }
+
+    /// Returns the item size, i.e. sizeof(T)
+    constexpr static ssize_t itemsize() { return sizeof(T); }
+
+    /// Returns the shape (i.e. size) of dimension `dim`
+    ssize_t shape(ssize_t dim) const { return shape_[(size_t) dim]; }
+
+    /// Returns the number of dimensions of the array
+    ssize_t ndim() const { return dims_; }
+
+    /// Returns the total number of elements in the referenced array, i.e. the product of the
+    /// shapes
+    template <bool Dyn = Dynamic>
+    enable_if_t<!Dyn, ssize_t> size() const {
+        return std::accumulate(
+            shape_.begin(), shape_.end(), (ssize_t) 1, std::multiplies<ssize_t>());
+    }
+    template <bool Dyn = Dynamic>
+    enable_if_t<Dyn, ssize_t> size() const {
+        return std::accumulate(shape_, shape_ + ndim(), (ssize_t) 1, std::multiplies<ssize_t>());
+    }
+
+    /// Returns the total number of bytes used by the referenced data.  Note that the actual span
+    /// in memory may be larger if the referenced array has non-contiguous strides (e.g. for a
+    /// slice).
+    ssize_t nbytes() const { return size() * itemsize(); }
+};
+
+template <typename T, ssize_t Dims>
+class unchecked_mutable_reference : public unchecked_reference<T, Dims> {
+    friend class pybind11::array;
+    using ConstBase = unchecked_reference<T, Dims>;
+    using ConstBase::ConstBase;
+    using ConstBase::Dynamic;
+
+public:
+    // Bring in const-qualified versions from base class
+    using ConstBase::operator();
+    using ConstBase::operator[];
+
+    /// Mutable, unchecked access to data at the given indices.
+    template <typename... Ix>
+    T &operator()(Ix... index) {
+        static_assert(ssize_t{sizeof...(Ix)} == Dims || Dynamic,
+                      "Invalid number of indices for unchecked array reference");
+        return const_cast<T &>(ConstBase::operator()(index...));
+    }
+    /**
+     * Mutable, unchecked access data at the given index; this operator only participates if the
+     * reference is to a 1-dimensional array (or has runtime dimensions).  When present, this is
+     * exactly equivalent to `obj(index)`.
+     */
+    template <ssize_t D = Dims, typename = enable_if_t<D == 1 || Dynamic>>
+    T &operator[](ssize_t index) {
+        return operator()(index);
+    }
+
+    /// Mutable pointer access to the data at the given indices.
+    template <typename... Ix>
+    T *mutable_data(Ix... ix) {
+        return &operator()(ssize_t(ix)...);
+    }
+};
+
+template <typename T, ssize_t Dim>
+struct type_caster<unchecked_reference<T, Dim>> {
+    static_assert(Dim == 0 && Dim > 0 /* always fail */,
+                  "unchecked array proxy object is not castable");
+};
+template <typename T, ssize_t Dim>
+struct type_caster<unchecked_mutable_reference<T, Dim>>
+    : type_caster<unchecked_reference<T, Dim>> {};
+
+PYBIND11_NAMESPACE_END(detail)
+
+class dtype : public object {
+public:
+    PYBIND11_OBJECT_DEFAULT(dtype, object, detail::npy_api::get().PyArrayDescr_Check_)
+
+    explicit dtype(const buffer_info &info) {
+        dtype descr(_dtype_from_pep3118()(pybind11::str(info.format)));
+        // If info.itemsize == 0, use the value calculated from the format string
+        m_ptr = descr.strip_padding(info.itemsize != 0 ? info.itemsize : descr.itemsize())
+                    .release()
+                    .ptr();
+    }
+
+    explicit dtype(const pybind11::str &format) : dtype(from_args(format)) {}
+
+    explicit dtype(const std::string &format) : dtype(pybind11::str(format)) {}
+
+    explicit dtype(const char *format) : dtype(pybind11::str(format)) {}
+
+    dtype(list names, list formats, list offsets, ssize_t itemsize) {
+        dict args;
+        args["names"] = std::move(names);
+        args["formats"] = std::move(formats);
+        args["offsets"] = std::move(offsets);
+        args["itemsize"] = pybind11::int_(itemsize);
+        m_ptr = from_args(args).release().ptr();
+    }
+
+    /// Return dtype for the given typenum (one of the NPY_TYPES).
+    /// https://numpy.org/devdocs/reference/c-api/array.html#c.PyArray_DescrFromType
+    explicit dtype(int typenum)
+        : object(detail::npy_api::get().PyArray_DescrFromType_(typenum), stolen_t{}) {
+        if (m_ptr == nullptr) {
+            throw error_already_set();
+        }
+    }
+
+    /// This is essentially the same as calling numpy.dtype(args) in Python.
+    static dtype from_args(const object &args) {
+        PyObject *ptr = nullptr;
+        if ((detail::npy_api::get().PyArray_DescrConverter_(args.ptr(), &ptr) == 0) || !ptr) {
+            throw error_already_set();
+        }
+        return reinterpret_steal<dtype>(ptr);
+    }
+
+    /// Return dtype associated with a C++ type.
+    template <typename T>
+    static dtype of() {
+        return detail::npy_format_descriptor<typename std::remove_cv<T>::type>::dtype();
+    }
+
+    /// Size of the data type in bytes.
+#ifdef PYBIND11_NUMPY_1_ONLY
+    ssize_t itemsize() const { return detail::array_descriptor_proxy(m_ptr)->elsize; }
+#else
+    ssize_t itemsize() const {
+        if (detail::npy_api::get().PyArray_RUNTIME_VERSION_ < 0x12) {
+            return detail::array_descriptor1_proxy(m_ptr)->elsize;
+        }
+        return detail::array_descriptor2_proxy(m_ptr)->elsize;
+    }
+#endif
+
+    /// Returns true for structured data types.
+#ifdef PYBIND11_NUMPY_1_ONLY
+    bool has_fields() const { return detail::array_descriptor_proxy(m_ptr)->names != nullptr; }
+#else
+    bool has_fields() const {
+        if (detail::npy_api::get().PyArray_RUNTIME_VERSION_ < 0x12) {
+            return detail::array_descriptor1_proxy(m_ptr)->names != nullptr;
+        }
+        const auto *proxy = detail::array_descriptor2_proxy(m_ptr);
+        if (proxy->type_num < 0 || proxy->type_num >= 2056) {
+            return false;
+        }
+        return proxy->names != nullptr;
+    }
+#endif
+
+    /// Single-character code for dtype's kind.
+    /// For example, floating point types are 'f' and integral types are 'i'.
+    char kind() const { return detail::array_descriptor_proxy(m_ptr)->kind; }
+
+    /// Single-character for dtype's type.
+    /// For example, ``float`` is 'f', ``double`` 'd', ``int`` 'i', and ``long`` 'l'.
+    char char_() const {
+        // Note: The signature, `dtype::char_` follows the naming of NumPy's
+        // public Python API (i.e., ``dtype.char``), rather than its internal
+        // C API (``PyArray_Descr::type``).
+        return detail::array_descriptor_proxy(m_ptr)->type;
+    }
+
+    /// type number of dtype.
+    int num() const {
+        // Note: The signature, `dtype::num` follows the naming of NumPy's public
+        // Python API (i.e., ``dtype.num``), rather than its internal
+        // C API (``PyArray_Descr::type_num``).
+        return detail::array_descriptor_proxy(m_ptr)->type_num;
+    }
+
+    /// Single character for byteorder
+    char byteorder() const { return detail::array_descriptor_proxy(m_ptr)->byteorder; }
+
+/// Alignment of the data type
+#ifdef PYBIND11_NUMPY_1_ONLY
+    int alignment() const { return detail::array_descriptor_proxy(m_ptr)->alignment; }
+#else
+    ssize_t alignment() const {
+        if (detail::npy_api::get().PyArray_RUNTIME_VERSION_ < 0x12) {
+            return detail::array_descriptor1_proxy(m_ptr)->alignment;
+        }
+        return detail::array_descriptor2_proxy(m_ptr)->alignment;
+    }
+#endif
+
+/// Flags for the array descriptor
+#ifdef PYBIND11_NUMPY_1_ONLY
+    char flags() const { return detail::array_descriptor_proxy(m_ptr)->flags; }
+#else
+    std::uint64_t flags() const {
+        if (detail::npy_api::get().PyArray_RUNTIME_VERSION_ < 0x12) {
+            return (unsigned char) detail::array_descriptor1_proxy(m_ptr)->flags;
+        }
+        return detail::array_descriptor2_proxy(m_ptr)->flags;
+    }
+#endif
+
+private:
+    static object &_dtype_from_pep3118() {
+        PYBIND11_CONSTINIT static gil_safe_call_once_and_store<object> storage;
+        return storage
+            .call_once_and_store_result([]() {
+                return detail::import_numpy_core_submodule("_internal")
+                    .attr("_dtype_from_pep3118");
+            })
+            .get_stored();
+    }
+
+    dtype strip_padding(ssize_t itemsize) {
+        // Recursively strip all void fields with empty names that are generated for
+        // padding fields (as of NumPy v1.11).
+        if (!has_fields()) {
+            return *this;
+        }
+
+        struct field_descr {
+            pybind11::str name;
+            object format;
+            pybind11::int_ offset;
+            field_descr(pybind11::str &&name, object &&format, pybind11::int_ &&offset)
+                : name{std::move(name)}, format{std::move(format)}, offset{std::move(offset)} {};
+        };
+        auto field_dict = attr("fields").cast<dict>();
+        std::vector<field_descr> field_descriptors;
+        field_descriptors.reserve(field_dict.size());
+
+        for (auto field : field_dict.attr("items")()) {
+            auto spec = field.cast<tuple>();
+            auto name = spec[0].cast<pybind11::str>();
+            auto spec_fo = spec[1].cast<tuple>();
+            auto format = spec_fo[0].cast<dtype>();
+            auto offset = spec_fo[1].cast<pybind11::int_>();
+            if ((len(name) == 0u) && format.kind() == 'V') {
+                continue;
+            }
+            field_descriptors.emplace_back(
+                std::move(name), format.strip_padding(format.itemsize()), std::move(offset));
+        }
+
+        std::sort(field_descriptors.begin(),
+                  field_descriptors.end(),
+                  [](const field_descr &a, const field_descr &b) {
+                      return a.offset.cast<int>() < b.offset.cast<int>();
+                  });
+
+        list names, formats, offsets;
+        for (auto &descr : field_descriptors) {
+            names.append(std::move(descr.name));
+            formats.append(std::move(descr.format));
+            offsets.append(std::move(descr.offset));
+        }
+        return dtype(std::move(names), std::move(formats), std::move(offsets), itemsize);
+    }
+};
+
+class array : public buffer {
+public:
+    PYBIND11_OBJECT_CVT(array, buffer, detail::npy_api::get().PyArray_Check_, raw_array)
+
+    enum {
+        c_style = detail::npy_api::NPY_ARRAY_C_CONTIGUOUS_,
+        f_style = detail::npy_api::NPY_ARRAY_F_CONTIGUOUS_,
+        forcecast = detail::npy_api::NPY_ARRAY_FORCECAST_
+    };
+
+    array() : array(0, static_cast<const double *>(nullptr)) {}
+
+    using ShapeContainer = detail::any_container<ssize_t>;
+    using StridesContainer = detail::any_container<ssize_t>;
+
+    // Constructs an array taking shape/strides from arbitrary container types
+    array(const pybind11::dtype &dt,
+          ShapeContainer shape,
+          StridesContainer strides,
+          const void *ptr = nullptr,
+          handle base = handle()) {
+
+        if (strides->empty()) {
+            *strides = detail::c_strides(*shape, dt.itemsize());
+        }
+
+        auto ndim = shape->size();
+        if (ndim != strides->size()) {
+            pybind11_fail("NumPy: shape ndim doesn't match strides ndim");
+        }
+        auto descr = dt;
+
+        int flags = 0;
+        if (base && ptr) {
+            if (isinstance<array>(base)) {
+                /* Copy flags from base (except ownership bit) */
+                flags = reinterpret_borrow<array>(base).flags()
+                        & ~detail::npy_api::NPY_ARRAY_OWNDATA_;
+            } else {
+                /* Writable by default, easy to downgrade later on if needed */
+                flags = detail::npy_api::NPY_ARRAY_WRITEABLE_;
+            }
+        }
+
+        auto &api = detail::npy_api::get();
+        auto tmp = reinterpret_steal<object>(api.PyArray_NewFromDescr_(
+            api.PyArray_Type_,
+            descr.release().ptr(),
+            (int) ndim,
+            // Use reinterpret_cast for PyPy on Windows (remove if fixed, checked on 7.3.1)
+            reinterpret_cast<Py_intptr_t *>(shape->data()),
+            reinterpret_cast<Py_intptr_t *>(strides->data()),
+            const_cast<void *>(ptr),
+            flags,
+            nullptr));
+        if (!tmp) {
+            throw error_already_set();
+        }
+        if (ptr) {
+            if (base) {
+                api.PyArray_SetBaseObject_(tmp.ptr(), base.inc_ref().ptr());
+            } else {
+                tmp = reinterpret_steal<object>(
+                    api.PyArray_NewCopy_(tmp.ptr(), -1 /* any order */));
+            }
+        }
+        m_ptr = tmp.release().ptr();
+    }
+
+    array(const pybind11::dtype &dt,
+          ShapeContainer shape,
+          const void *ptr = nullptr,
+          handle base = handle())
+        : array(dt, std::move(shape), {}, ptr, base) {}
+
+    template <typename T,
+              typename
+              = detail::enable_if_t<std::is_integral<T>::value && !std::is_same<bool, T>::value>>
+    array(const pybind11::dtype &dt, T count, const void *ptr = nullptr, handle base = handle())
+        : array(dt, {{count}}, ptr, base) {}
+
+    template <typename T>
+    array(ShapeContainer shape, StridesContainer strides, const T *ptr, handle base = handle())
+        : array(pybind11::dtype::of<T>(),
+                std::move(shape),
+                std::move(strides),
+                reinterpret_cast<const void *>(ptr),
+                base) {}
+
+    template <typename T>
+    array(ShapeContainer shape, const T *ptr, handle base = handle())
+        : array(std::move(shape), {}, ptr, base) {}
+
+    template <typename T>
+    explicit array(ssize_t count, const T *ptr, handle base = handle())
+        : array({count}, {}, ptr, base) {}
+
+    explicit array(const buffer_info &info, handle base = handle())
+        : array(pybind11::dtype(info), info.shape, info.strides, info.ptr, base) {}
+
+    /// Array descriptor (dtype)
+    pybind11::dtype dtype() const {
+        return reinterpret_borrow<pybind11::dtype>(detail::array_proxy(m_ptr)->descr);
+    }
+
+    /// Total number of elements
+    ssize_t size() const {
+        return std::accumulate(shape(), shape() + ndim(), (ssize_t) 1, std::multiplies<ssize_t>());
+    }
+
+    /// Byte size of a single element
+    ssize_t itemsize() const { return dtype().itemsize(); }
+
+    /// Total number of bytes
+    ssize_t nbytes() const { return size() * itemsize(); }
+
+    /// Number of dimensions
+    ssize_t ndim() const { return detail::array_proxy(m_ptr)->nd; }
+
+    /// Base object
+    object base() const { return reinterpret_borrow<object>(detail::array_proxy(m_ptr)->base); }
+
+    /// Dimensions of the array
+    const ssize_t *shape() const { return detail::array_proxy(m_ptr)->dimensions; }
+
+    /// Dimension along a given axis
+    ssize_t shape(ssize_t dim) const {
+        if (dim >= ndim()) {
+            fail_dim_check(dim, "invalid axis");
+        }
+        return shape()[dim];
+    }
+
+    /// Strides of the array
+    const ssize_t *strides() const { return detail::array_proxy(m_ptr)->strides; }
+
+    /// Stride along a given axis
+    ssize_t strides(ssize_t dim) const {
+        if (dim >= ndim()) {
+            fail_dim_check(dim, "invalid axis");
+        }
+        return strides()[dim];
+    }
+
+    /// Return the NumPy array flags
+    int flags() const { return detail::array_proxy(m_ptr)->flags; }
+
+    /// If set, the array is writeable (otherwise the buffer is read-only)
+    bool writeable() const {
+        return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_WRITEABLE_);
+    }
+
+    /// If set, the array owns the data (will be freed when the array is deleted)
+    bool owndata() const {
+        return detail::check_flags(m_ptr, detail::npy_api::NPY_ARRAY_OWNDATA_);
+    }
+
+    /// Pointer to the contained data. If index is not provided, points to the
+    /// beginning of the buffer. May throw if the index would lead to out of bounds access.
+    template <typename... Ix>
+    const void *data(Ix... index) const {
+        return static_cast<const void *>(detail::array_proxy(m_ptr)->data + offset_at(index...));
+    }
+
+    /// Mutable pointer to the contained data. If index is not provided, points to the
+    /// beginning of the buffer. May throw if the index would lead to out of bounds access.
+    /// May throw if the array is not writeable.
+    template <typename... Ix>
+    void *mutable_data(Ix... index) {
+        check_writeable();
+        return static_cast<void *>(detail::array_proxy(m_ptr)->data + offset_at(index...));
+    }
+
+    /// Byte offset from beginning of the array to a given index (full or partial).
+    /// May throw if the index would lead to out of bounds access.
+    template <typename... Ix>
+    ssize_t offset_at(Ix... index) const {
+        if ((ssize_t) sizeof...(index) > ndim()) {
+            fail_dim_check(sizeof...(index), "too many indices for an array");
+        }
+        return byte_offset(ssize_t(index)...);
+    }
+
+    ssize_t offset_at() const { return 0; }
+
+    /// Item count from beginning of the array to a given index (full or partial).
+    /// May throw if the index would lead to out of bounds access.
+    template <typename... Ix>
+    ssize_t index_at(Ix... index) const {
+        return offset_at(index...) / itemsize();
+    }
+
+    /**
+     * Returns a proxy object that provides access to the array's data without bounds or
+     * dimensionality checking.  Will throw if the array is missing the `writeable` flag.  Use with
+     * care: the array must not be destroyed or reshaped for the duration of the returned object,
+     * and the caller must take care not to access invalid dimensions or dimension indices.
+     */
+    template <typename T, ssize_t Dims = -1>
+    detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
+        if (Dims >= 0 && ndim() != Dims) {
+            throw std::domain_error("array has incorrect number of dimensions: "
+                                    + std::to_string(ndim()) + "; expected "
+                                    + std::to_string(Dims));
+        }
+        return detail::unchecked_mutable_reference<T, Dims>(
+            mutable_data(), shape(), strides(), ndim());
+    }
+
+    /**
+     * Returns a proxy object that provides const access to the array's data without bounds or
+     * dimensionality checking.  Unlike `mutable_unchecked()`, this does not require that the
+     * underlying array have the `writable` flag.  Use with care: the array must not be destroyed
+     * or reshaped for the duration of the returned object, and the caller must take care not to
+     * access invalid dimensions or dimension indices.
+     */
+    template <typename T, ssize_t Dims = -1>
+    detail::unchecked_reference<T, Dims> unchecked() const & {
+        if (Dims >= 0 && ndim() != Dims) {
+            throw std::domain_error("array has incorrect number of dimensions: "
+                                    + std::to_string(ndim()) + "; expected "
+                                    + std::to_string(Dims));
+        }
+        return detail::unchecked_reference<T, Dims>(data(), shape(), strides(), ndim());
+    }
+
+    /// Return a new view with all of the dimensions of length 1 removed
+    array squeeze() {
+        auto &api = detail::npy_api::get();
+        return reinterpret_steal<array>(api.PyArray_Squeeze_(m_ptr));
+    }
+
+    /// Resize array to given shape
+    /// If refcheck is true and more that one reference exist to this array
+    /// then resize will succeed only if it makes a reshape, i.e. original size doesn't change
+    void resize(ShapeContainer new_shape, bool refcheck = true) {
+        detail::npy_api::PyArray_Dims d
+            = {// Use reinterpret_cast for PyPy on Windows (remove if fixed, checked on 7.3.1)
+               reinterpret_cast<Py_intptr_t *>(new_shape->data()),
+               int(new_shape->size())};
+        // try to resize, set ordering param to -1 cause it's not used anyway
+        auto new_array = reinterpret_steal<object>(
+            detail::npy_api::get().PyArray_Resize_(m_ptr, &d, int(refcheck), -1));
+        if (!new_array) {
+            throw error_already_set();
+        }
+        if (isinstance<array>(new_array)) {
+            *this = std::move(new_array);
+        }
+    }
+
+    /// Optional `order` parameter omitted, to be added as needed.
+    array reshape(ShapeContainer new_shape) {
+        detail::npy_api::PyArray_Dims d
+            = {reinterpret_cast<Py_intptr_t *>(new_shape->data()), int(new_shape->size())};
+        auto new_array
+            = reinterpret_steal<array>(detail::npy_api::get().PyArray_Newshape_(m_ptr, &d, 0));
+        if (!new_array) {
+            throw error_already_set();
+        }
+        return new_array;
+    }
+
+    /// Create a view of an array in a different data type.
+    /// This function may fundamentally reinterpret the data in the array.
+    /// It is the responsibility of the caller to ensure that this is safe.
+    /// Only supports the `dtype` argument, the `type` argument is omitted,
+    /// to be added as needed.
+    array view(const std::string &dtype) {
+        auto &api = detail::npy_api::get();
+        auto new_view = reinterpret_steal<array>(api.PyArray_View_(
+            m_ptr, dtype::from_args(pybind11::str(dtype)).release().ptr(), nullptr));
+        if (!new_view) {
+            throw error_already_set();
+        }
+        return new_view;
+    }
+
+    /// Ensure that the argument is a NumPy array
+    /// In case of an error, nullptr is returned and the Python error is cleared.
+    static array ensure(handle h, int ExtraFlags = 0) {
+        auto result = reinterpret_steal<array>(raw_array(h.ptr(), ExtraFlags));
+        if (!result) {
+            PyErr_Clear();
+        }
+        return result;
+    }
+
+protected:
+    template <typename, typename>
+    friend struct detail::npy_format_descriptor;
+
+    void fail_dim_check(ssize_t dim, const std::string &msg) const {
+        throw index_error(msg + ": " + std::to_string(dim) + " (ndim = " + std::to_string(ndim())
+                          + ')');
+    }
+
+    template <typename... Ix>
+    ssize_t byte_offset(Ix... index) const {
+        check_dimensions(index...);
+        return detail::byte_offset_unsafe(strides(), ssize_t(index)...);
+    }
+
+    void check_writeable() const {
+        if (!writeable()) {
+            throw std::domain_error("array is not writeable");
+        }
+    }
+
+    template <typename... Ix>
+    void check_dimensions(Ix... index) const {
+        check_dimensions_impl(ssize_t(0), shape(), ssize_t(index)...);
+    }
+
+    void check_dimensions_impl(ssize_t, const ssize_t *) const {}
+
+    template <typename... Ix>
+    void check_dimensions_impl(ssize_t axis, const ssize_t *shape, ssize_t i, Ix... index) const {
+        if (i >= *shape) {
+            throw index_error(std::string("index ") + std::to_string(i)
+                              + " is out of bounds for axis " + std::to_string(axis)
+                              + " with size " + std::to_string(*shape));
+        }
+        check_dimensions_impl(axis + 1, shape + 1, index...);
+    }
+
+    /// Create array from any object -- always returns a new reference
+    static PyObject *raw_array(PyObject *ptr, int ExtraFlags = 0) {
+        if (ptr == nullptr) {
+            set_error(PyExc_ValueError, "cannot create a pybind11::array from a nullptr");
+            return nullptr;
+        }
+        return detail::npy_api::get().PyArray_FromAny_(
+            ptr, nullptr, 0, 0, detail::npy_api::NPY_ARRAY_ENSUREARRAY_ | ExtraFlags, nullptr);
+    }
+};
+
+template <typename T, int ExtraFlags = array::forcecast>
+class array_t : public array {
+private:
+    struct private_ctor {};
+    // Delegating constructor needed when both moving and accessing in the same constructor
+    array_t(private_ctor,
+            ShapeContainer &&shape,
+            StridesContainer &&strides,
+            const T *ptr,
+            handle base)
+        : array(std::move(shape), std::move(strides), ptr, base) {}
+
+public:
+    static_assert(!detail::array_info<T>::is_array, "Array types cannot be used with array_t");
+
+    using value_type = T;
+
+    array_t() : array(0, static_cast<const T *>(nullptr)) {}
+    array_t(handle h, borrowed_t) : array(h, borrowed_t{}) {}
+    array_t(handle h, stolen_t) : array(h, stolen_t{}) {}
+
+    PYBIND11_DEPRECATED("Use array_t<T>::ensure() instead")
+    array_t(handle h, bool is_borrowed) : array(raw_array_t(h.ptr()), stolen_t{}) {
+        if (!m_ptr) {
+            PyErr_Clear();
+        }
+        if (!is_borrowed) {
+            Py_XDECREF(h.ptr());
+        }
+    }
+
+    // NOLINTNEXTLINE(google-explicit-constructor)
+    array_t(const object &o) : array(raw_array_t(o.ptr()), stolen_t{}) {
+        if (!m_ptr) {
+            throw error_already_set();
+        }
+    }
+
+    explicit array_t(const buffer_info &info, handle base = handle()) : array(info, base) {}
+
+    array_t(ShapeContainer shape,
+            StridesContainer strides,
+            const T *ptr = nullptr,
+            handle base = handle())
+        : array(std::move(shape), std::move(strides), ptr, base) {}
+
+    explicit array_t(ShapeContainer shape, const T *ptr = nullptr, handle base = handle())
+        : array_t(private_ctor{},
+                  std::move(shape),
+                  (ExtraFlags & f_style) != 0 ? detail::f_strides(*shape, itemsize())
+                                              : detail::c_strides(*shape, itemsize()),
+                  ptr,
+                  base) {}
+
+    explicit array_t(ssize_t count, const T *ptr = nullptr, handle base = handle())
+        : array({count}, {}, ptr, base) {}
+
+    constexpr ssize_t itemsize() const { return sizeof(T); }
+
+    template <typename... Ix>
+    ssize_t index_at(Ix... index) const {
+        return offset_at(index...) / itemsize();
+    }
+
+    template <typename... Ix>
+    const T *data(Ix... index) const {
+        return static_cast<const T *>(array::data(index...));
+    }
+
+    template <typename... Ix>
+    T *mutable_data(Ix... index) {
+        return static_cast<T *>(array::mutable_data(index...));
+    }
+
+    // Reference to element at a given index
+    template <typename... Ix>
+    const T &at(Ix... index) const {
+        if ((ssize_t) sizeof...(index) != ndim()) {
+            fail_dim_check(sizeof...(index), "index dimension mismatch");
+        }
+        return *(static_cast<const T *>(array::data())
+                 + byte_offset(ssize_t(index)...) / itemsize());
+    }
+
+    // Mutable reference to element at a given index
+    template <typename... Ix>
+    T &mutable_at(Ix... index) {
+        if ((ssize_t) sizeof...(index) != ndim()) {
+            fail_dim_check(sizeof...(index), "index dimension mismatch");
+        }
+        return *(static_cast<T *>(array::mutable_data())
+                 + byte_offset(ssize_t(index)...) / itemsize());
+    }
+
+    /**
+     * Returns a proxy object that provides access to the array's data without bounds or
+     * dimensionality checking.  Will throw if the array is missing the `writeable` flag.  Use with
+     * care: the array must not be destroyed or reshaped for the duration of the returned object,
+     * and the caller must take care not to access invalid dimensions or dimension indices.
+     */
+    template <ssize_t Dims = -1>
+    detail::unchecked_mutable_reference<T, Dims> mutable_unchecked() & {
+        return array::mutable_unchecked<T, Dims>();
+    }
+
+    /**
+     * Returns a proxy object that provides const access to the array's data without bounds or
+     * dimensionality checking.  Unlike `mutable_unchecked()`, this does not require that the
+     * underlying array have the `writable` flag.  Use with care: the array must not be destroyed
+     * or reshaped for the duration of the returned object, and the caller must take care not to
+     * access invalid dimensions or dimension indices.
+     */
+    template <ssize_t Dims = -1>
+    detail::unchecked_reference<T, Dims> unchecked() const & {
+        return array::unchecked<T, Dims>();
+    }
+
+    /// Ensure that the argument is a NumPy array of the correct dtype (and if not, try to convert
+    /// it).  In case of an error, nullptr is returned and the Python error is cleared.
+    static array_t ensure(handle h) {
+        auto result = reinterpret_steal<array_t>(raw_array_t(h.ptr()));
+        if (!result) {
+            PyErr_Clear();
+        }
+        return result;
+    }
+
+    static bool check_(handle h) {
+        const auto &api = detail::npy_api::get();
+        return api.PyArray_Check_(h.ptr())
+               && api.PyArray_EquivTypes_(detail::array_proxy(h.ptr())->descr,
+                                          dtype::of<T>().ptr())
+               && detail::check_flags(h.ptr(), ExtraFlags & (array::c_style | array::f_style));
+    }
+
+protected:
+    /// Create array from any object -- always returns a new reference
+    static PyObject *raw_array_t(PyObject *ptr) {
+        if (ptr == nullptr) {
+            set_error(PyExc_ValueError, "cannot create a pybind11::array_t from a nullptr");
+            return nullptr;
+        }
+        return detail::npy_api::get().PyArray_FromAny_(ptr,
+                                                       dtype::of<T>().release().ptr(),
+                                                       0,
+                                                       0,
+                                                       detail::npy_api::NPY_ARRAY_ENSUREARRAY_
+                                                           | ExtraFlags,
+                                                       nullptr);
+    }
+};
+
+template <typename T>
+struct format_descriptor<T, detail::enable_if_t<detail::is_pod_struct<T>::value>> {
+    static std::string format() {
+        return detail::npy_format_descriptor<typename std::remove_cv<T>::type>::format();
+    }
+};
+
+template <size_t N>
+struct format_descriptor<char[N]> {
+    static std::string format() { return std::to_string(N) + 's'; }
+};
+template <size_t N>
+struct format_descriptor<std::array<char, N>> {
+    static std::string format() { return std::to_string(N) + 's'; }
+};
+
+template <typename T>
+struct format_descriptor<T, detail::enable_if_t<std::is_enum<T>::value>> {
+    static std::string format() {
+        return format_descriptor<
+            typename std::remove_cv<typename std::underlying_type<T>::type>::type>::format();
+    }
+};
+
+template <typename T>
+struct format_descriptor<T, detail::enable_if_t<detail::array_info<T>::is_array>> {
+    static std::string format() {
+        using namespace detail;
+        static constexpr auto extents = const_name("(") + array_info<T>::extents + const_name(")");
+        return extents.text + format_descriptor<remove_all_extents_t<T>>::format();
+    }
+};
+
+PYBIND11_NAMESPACE_BEGIN(detail)
+template <typename T, int ExtraFlags>
+struct pyobject_caster<array_t<T, ExtraFlags>> {
+    using type = array_t<T, ExtraFlags>;
+
+    bool load(handle src, bool convert) {
+        if (!convert && !type::check_(src)) {
+            return false;
+        }
+        value = type::ensure(src);
+        return static_cast<bool>(value);
+    }
+
+    static handle cast(const handle &src, return_value_policy /* policy */, handle /* parent */) {
+        return src.inc_ref();
+    }
+    PYBIND11_TYPE_CASTER(type, handle_type_name<type>::name);
+};
+
+template <typename T>
+struct compare_buffer_info<T, detail::enable_if_t<detail::is_pod_struct<T>::value>> {
+    static bool compare(const buffer_info &b) {
+        return npy_api::get().PyArray_EquivTypes_(dtype::of<T>().ptr(), dtype(b).ptr());
+    }
+};
+
+template <typename T, typename = void>
+struct npy_format_descriptor_name;
+
+template <typename T>
+struct npy_format_descriptor_name<T, enable_if_t<std::is_integral<T>::value>> {
+    static constexpr auto name = const_name<std::is_same<T, bool>::value>(
+        const_name("bool"),
+        const_name<std::is_signed<T>::value>("numpy.int", "numpy.uint")
+            + const_name<sizeof(T) * 8>());
+};
+
+template <typename T>
+struct npy_format_descriptor_name<T, enable_if_t<std::is_floating_point<T>::value>> {
+    static constexpr auto name = const_name < std::is_same<T, float>::value
+                                 || std::is_same<T, const float>::value
+                                 || std::is_same<T, double>::value
+                                 || std::is_same<T, const double>::value
+                                        > (const_name("numpy.float") + const_name<sizeof(T) * 8>(),
+                                           const_name("numpy.longdouble"));
+};
+
+template <typename T>
+struct npy_format_descriptor_name<T, enable_if_t<is_complex<T>::value>> {
+    static constexpr auto name = const_name < std::is_same<typename T::value_type, float>::value
+                                 || std::is_same<typename T::value_type, const float>::value
+                                 || std::is_same<typename T::value_type, double>::value
+                                 || std::is_same<typename T::value_type, const double>::value
+                                        > (const_name("numpy.complex")
+                                               + const_name<sizeof(typename T::value_type) * 16>(),
+                                           const_name("numpy.longcomplex"));
+};
+
+template <typename T>
+struct npy_format_descriptor<
+    T,
+    enable_if_t<satisfies_any_of<T, std::is_arithmetic, is_complex>::value>>
+    : npy_format_descriptor_name<T> {
+private:
+    // NB: the order here must match the one in common.h
+    constexpr static const int values[15] = {npy_api::NPY_BOOL_,
+                                             npy_api::NPY_BYTE_,
+                                             npy_api::NPY_UBYTE_,
+                                             npy_api::NPY_INT16_,
+                                             npy_api::NPY_UINT16_,
+                                             npy_api::NPY_INT32_,
+                                             npy_api::NPY_UINT32_,
+                                             npy_api::NPY_INT64_,
+                                             npy_api::NPY_UINT64_,
+                                             npy_api::NPY_FLOAT_,
+                                             npy_api::NPY_DOUBLE_,
+                                             npy_api::NPY_LONGDOUBLE_,
+                                             npy_api::NPY_CFLOAT_,
+                                             npy_api::NPY_CDOUBLE_,
+                                             npy_api::NPY_CLONGDOUBLE_};
+
+public:
+    static constexpr int value = values[detail::is_fmt_numeric<T>::index];
+
+    static pybind11::dtype dtype() { return pybind11::dtype(/*typenum*/ value); }
+};
+
+template <typename T>
+struct npy_format_descriptor<T, enable_if_t<is_same_ignoring_cvref<T, PyObject *>::value>> {
+    static constexpr auto name = const_name("object");
+
+    static constexpr int value = npy_api::NPY_OBJECT_;
+
+    static pybind11::dtype dtype() { return pybind11::dtype(/*typenum*/ value); }
+};
+
+#define PYBIND11_DECL_CHAR_FMT                                                                    \
+    static constexpr auto name = const_name("S") + const_name<N>();                               \
+    static pybind11::dtype dtype() {                                                              \
+        return pybind11::dtype(std::string("S") + std::to_string(N));                             \
+    }
+template <size_t N>
+struct npy_format_descriptor<char[N]> {
+    PYBIND11_DECL_CHAR_FMT
+};
+template <size_t N>
+struct npy_format_descriptor<std::array<char, N>> {
+    PYBIND11_DECL_CHAR_FMT
+};
+#undef PYBIND11_DECL_CHAR_FMT
+
+template <typename T>
+struct npy_format_descriptor<T, enable_if_t<array_info<T>::is_array>> {
+private:
+    using base_descr = npy_format_descriptor<typename array_info<T>::type>;
+
+public:
+    static_assert(!array_info<T>::is_empty, "Zero-sized arrays are not supported");
+
+    static constexpr auto name
+        = const_name("(") + array_info<T>::extents + const_name(")") + base_descr::name;
+    static pybind11::dtype dtype() {
+        list shape;
+        array_info<T>::append_extents(shape);
+        return pybind11::dtype::from_args(
+            pybind11::make_tuple(base_descr::dtype(), std::move(shape)));
+    }
+};
+
+template <typename T>
+struct npy_format_descriptor<T, enable_if_t<std::is_enum<T>::value>> {
+private:
+    using base_descr = npy_format_descriptor<typename std::underlying_type<T>::type>;
+
+public:
+    static constexpr auto name = base_descr::name;
+    static pybind11::dtype dtype() { return base_descr::dtype(); }
+};
+
+struct field_descriptor {
+    const char *name;
+    ssize_t offset;
+    ssize_t size;
+    std::string format;
+    dtype descr;
+};
+
+PYBIND11_NOINLINE void register_structured_dtype(any_container<field_descriptor> fields,
+                                                 const std::type_info &tinfo,
+                                                 ssize_t itemsize,
+                                                 bool (*direct_converter)(PyObject *, void *&)) {
+
+    auto &numpy_internals = get_numpy_internals();
+    if (numpy_internals.get_type_info(tinfo, false)) {
+        pybind11_fail("NumPy: dtype is already registered");
+    }
+
+    // Use ordered fields because order matters as of NumPy 1.14:
+    // https://docs.scipy.org/doc/numpy/release.html#multiple-field-indexing-assignment-of-structured-arrays
+    std::vector<field_descriptor> ordered_fields(std::move(fields));
+    std::sort(
+        ordered_fields.begin(),
+        ordered_fields.end(),
+        [](const field_descriptor &a, const field_descriptor &b) { return a.offset < b.offset; });
+
+    list names, formats, offsets;
+    for (auto &field : ordered_fields) {
+        if (!field.descr) {
+            pybind11_fail(std::string("NumPy: unsupported field dtype: `") + field.name + "` @ "
+                          + tinfo.name());
+        }
+        names.append(pybind11::str(field.name));
+        formats.append(field.descr);
+        offsets.append(pybind11::int_(field.offset));
+    }
+    auto *dtype_ptr
+        = pybind11::dtype(std::move(names), std::move(formats), std::move(offsets), itemsize)
+              .release()
+              .ptr();
+
+    // There is an existing bug in NumPy (as of v1.11): trailing bytes are
+    // not encoded explicitly into the format string. This will supposedly
+    // get fixed in v1.12; for further details, see these:
+    // - https://github.com/numpy/numpy/issues/7797
+    // - https://github.com/numpy/numpy/pull/7798
+    // Because of this, we won't use numpy's logic to generate buffer format
+    // strings and will just do it ourselves.
+    ssize_t offset = 0;
+    std::ostringstream oss;
+    // mark the structure as unaligned with '^', because numpy and C++ don't
+    // always agree about alignment (particularly for complex), and we're
+    // explicitly listing all our padding. This depends on none of the fields
+    // overriding the endianness. Putting the ^ in front of individual fields
+    // isn't guaranteed to work due to https://github.com/numpy/numpy/issues/9049
+    oss << "^T{";
+    for (auto &field : ordered_fields) {
+        if (field.offset > offset) {
+            oss << (field.offset - offset) << 'x';
+        }
+        oss << field.format << ':' << field.name << ':';
+        offset = field.offset + field.size;
+    }
+    if (itemsize > offset) {
+        oss << (itemsize - offset) << 'x';
+    }
+    oss << '}';
+    auto format_str = oss.str();
+
+    // Smoke test: verify that NumPy properly parses our buffer format string
+    auto &api = npy_api::get();
+    auto arr = array(buffer_info(nullptr, itemsize, format_str, 1));
+    if (!api.PyArray_EquivTypes_(dtype_ptr, arr.dtype().ptr())) {
+        pybind11_fail("NumPy: invalid buffer descriptor!");
+    }
+
+    auto tindex = std::type_index(tinfo);
+    numpy_internals.registered_dtypes[tindex] = {dtype_ptr, std::move(format_str)};
+    with_internals([tindex, &direct_converter](internals &internals) {
+        internals.direct_conversions[tindex].push_back(direct_converter);
+    });
+}
+
+template <typename T, typename SFINAE>
+struct npy_format_descriptor {
+    static_assert(is_pod_struct<T>::value,
+                  "Attempt to use a non-POD or unimplemented POD type as a numpy dtype");
+
+    static constexpr auto name = make_caster<T>::name;
+
+    static pybind11::dtype dtype() { return reinterpret_borrow<pybind11::dtype>(dtype_ptr()); }
+
+    static std::string format() {
+        static auto format_str = get_numpy_internals().get_type_info<T>(true)->format_str;
+        return format_str;
+    }
+
+    static void register_dtype(any_container<field_descriptor> fields) {
+        register_structured_dtype(std::move(fields),
+                                  typeid(typename std::remove_cv<T>::type),
+                                  sizeof(T),
+                                  &direct_converter);
+    }
+
+private:
+    static PyObject *dtype_ptr() {
+        static PyObject *ptr = get_numpy_internals().get_type_info<T>(true)->dtype_ptr;
+        return ptr;
+    }
+
+    static bool direct_converter(PyObject *obj, void *&value) {
+        auto &api = npy_api::get();
+        if (!PyObject_TypeCheck(obj, api.PyVoidArrType_Type_)) {
+            return false;
+        }
+        if (auto descr = reinterpret_steal<object>(api.PyArray_DescrFromScalar_(obj))) {
+            if (api.PyArray_EquivTypes_(dtype_ptr(), descr.ptr())) {
+                value = ((PyVoidScalarObject_Proxy *) obj)->obval;
+                return true;
+            }
+        }
+        return false;
+    }
+};
+
+#ifdef __CLION_IDE__ // replace heavy macro with dummy code for the IDE (doesn't affect code)
+#    define PYBIND11_NUMPY_DTYPE(Type, ...) ((void) 0)
+#    define PYBIND11_NUMPY_DTYPE_EX(Type, ...) ((void) 0)
+#else
+
+#    define PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, Name)                                          \
+        ::pybind11::detail::field_descriptor {                                                    \
+            Name, offsetof(T, Field), sizeof(decltype(std::declval<T>().Field)),                  \
+                ::pybind11::format_descriptor<decltype(std::declval<T>().Field)>::format(),       \
+                ::pybind11::detail::npy_format_descriptor<                                        \
+                    decltype(std::declval<T>().Field)>::dtype()                                   \
+        }
+
+// Extract name, offset and format descriptor for a struct field
+#    define PYBIND11_FIELD_DESCRIPTOR(T, Field) PYBIND11_FIELD_DESCRIPTOR_EX(T, Field, #Field)
+
+// The main idea of this macro is borrowed from https://github.com/swansontec/map-macro
+// (C) William Swanson, Paul Fultz
+#    define PYBIND11_EVAL0(...) __VA_ARGS__
+#    define PYBIND11_EVAL1(...) PYBIND11_EVAL0(PYBIND11_EVAL0(PYBIND11_EVAL0(__VA_ARGS__)))
+#    define PYBIND11_EVAL2(...) PYBIND11_EVAL1(PYBIND11_EVAL1(PYBIND11_EVAL1(__VA_ARGS__)))
+#    define PYBIND11_EVAL3(...) PYBIND11_EVAL2(PYBIND11_EVAL2(PYBIND11_EVAL2(__VA_ARGS__)))
+#    define PYBIND11_EVAL4(...) PYBIND11_EVAL3(PYBIND11_EVAL3(PYBIND11_EVAL3(__VA_ARGS__)))
+#    define PYBIND11_EVAL(...) PYBIND11_EVAL4(PYBIND11_EVAL4(PYBIND11_EVAL4(__VA_ARGS__)))
+#    define PYBIND11_MAP_END(...)
+#    define PYBIND11_MAP_OUT
+#    define PYBIND11_MAP_COMMA ,
+#    define PYBIND11_MAP_GET_END() 0, PYBIND11_MAP_END
+#    define PYBIND11_MAP_NEXT0(test, next, ...) next PYBIND11_MAP_OUT
+#    define PYBIND11_MAP_NEXT1(test, next) PYBIND11_MAP_NEXT0(test, next, 0)
+#    define PYBIND11_MAP_NEXT(test, next) PYBIND11_MAP_NEXT1(PYBIND11_MAP_GET_END test, next)
+#    if defined(_MSC_VER)                                                                         \
+        && !defined(__clang__) // MSVC is not as eager to expand macros, hence this workaround
+#        define PYBIND11_MAP_LIST_NEXT1(test, next)                                               \
+            PYBIND11_EVAL0(PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0))
+#    else
+#        define PYBIND11_MAP_LIST_NEXT1(test, next)                                               \
+            PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0)
+#    endif
+#    define PYBIND11_MAP_LIST_NEXT(test, next)                                                    \
+        PYBIND11_MAP_LIST_NEXT1(PYBIND11_MAP_GET_END test, next)
+#    define PYBIND11_MAP_LIST0(f, t, x, peek, ...)                                                \
+        f(t, x) PYBIND11_MAP_LIST_NEXT(peek, PYBIND11_MAP_LIST1)(f, t, peek, __VA_ARGS__)
+#    define PYBIND11_MAP_LIST1(f, t, x, peek, ...)                                                \
+        f(t, x) PYBIND11_MAP_LIST_NEXT(peek, PYBIND11_MAP_LIST0)(f, t, peek, __VA_ARGS__)
+// PYBIND11_MAP_LIST(f, t, a1, a2, ...) expands to f(t, a1), f(t, a2), ...
+#    define PYBIND11_MAP_LIST(f, t, ...)                                                          \
+        PYBIND11_EVAL(PYBIND11_MAP_LIST1(f, t, __VA_ARGS__, (), 0))
+
+#    define PYBIND11_NUMPY_DTYPE(Type, ...)                                                       \
+        ::pybind11::detail::npy_format_descriptor<Type>::register_dtype(                          \
+            ::std::vector<::pybind11::detail::field_descriptor>{                                  \
+                PYBIND11_MAP_LIST(PYBIND11_FIELD_DESCRIPTOR, Type, __VA_ARGS__)})
+
+#    if defined(_MSC_VER) && !defined(__clang__)
+#        define PYBIND11_MAP2_LIST_NEXT1(test, next)                                              \
+            PYBIND11_EVAL0(PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0))
+#    else
+#        define PYBIND11_MAP2_LIST_NEXT1(test, next)                                              \
+            PYBIND11_MAP_NEXT0(test, PYBIND11_MAP_COMMA next, 0)
+#    endif
+#    define PYBIND11_MAP2_LIST_NEXT(test, next)                                                   \
+        PYBIND11_MAP2_LIST_NEXT1(PYBIND11_MAP_GET_END test, next)
+#    define PYBIND11_MAP2_LIST0(f, t, x1, x2, peek, ...)                                          \
+        f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT(peek, PYBIND11_MAP2_LIST1)(f, t, peek, __VA_ARGS__)
+#    define PYBIND11_MAP2_LIST1(f, t, x1, x2, peek, ...)                                          \
+        f(t, x1, x2) PYBIND11_MAP2_LIST_NEXT(peek, PYBIND11_MAP2_LIST0)(f, t, peek, __VA_ARGS__)
+// PYBIND11_MAP2_LIST(f, t, a1, a2, ...) expands to f(t, a1, a2), f(t, a3, a4), ...
+#    define PYBIND11_MAP2_LIST(f, t, ...)                                                         \
+        PYBIND11_EVAL(PYBIND11_MAP2_LIST1(f, t, __VA_ARGS__, (), 0))
+
+#    define PYBIND11_NUMPY_DTYPE_EX(Type, ...)                                                    \
+        ::pybind11::detail::npy_format_descriptor<Type>::register_dtype(                          \
+            ::std::vector<::pybind11::detail::field_descriptor>{                                  \
+                PYBIND11_MAP2_LIST(PYBIND11_FIELD_DESCRIPTOR_EX, Type, __VA_ARGS__)})
+
+#endif // __CLION_IDE__
+
+class common_iterator {
+public:
+    using container_type = std::vector<ssize_t>;
+    using value_type = container_type::value_type;
+    using size_type = container_type::size_type;
+
+    common_iterator() : m_strides() {}
+
+    common_iterator(void *ptr, const container_type &strides, const container_type &shape)
+        : p_ptr(reinterpret_cast<char *>(ptr)), m_strides(strides.size()) {
+        m_strides.back() = static_cast<value_type>(strides.back());
+        for (size_type i = m_strides.size() - 1; i != 0; --i) {
+            size_type j = i - 1;
+            auto s = static_cast<value_type>(shape[i]);
+            m_strides[j] = strides[j] + m_strides[i] - strides[i] * s;
+        }
+    }
+
+    void increment(size_type dim) { p_ptr += m_strides[dim]; }
+
+    void *data() const { return p_ptr; }
+
+private:
+    char *p_ptr{nullptr};
+    container_type m_strides;
+};
+
+template <size_t N>
+class multi_array_iterator {
+public:
+    using container_type = std::vector<ssize_t>;
+
+    multi_array_iterator(const std::array<buffer_info, N> &buffers, const container_type &shape)
+        : m_shape(shape.size()), m_index(shape.size(), 0), m_common_iterator() {
+
+        // Manual copy to avoid conversion warning if using std::copy
+        for (size_t i = 0; i < shape.size(); ++i) {
+            m_shape[i] = shape[i];
+        }
+
+        container_type strides(shape.size());
+        for (size_t i = 0; i < N; ++i) {
+            init_common_iterator(buffers[i], shape, m_common_iterator[i], strides);
+        }
+    }
+
+    multi_array_iterator &operator++() {
+        for (size_t j = m_index.size(); j != 0; --j) {
+            size_t i = j - 1;
+            if (++m_index[i] != m_shape[i]) {
+                increment_common_iterator(i);
+                break;
+            }
+            m_index[i] = 0;
+        }
+        return *this;
+    }
+
+    template <size_t K, class T = void>
+    T *data() const {
+        return reinterpret_cast<T *>(m_common_iterator[K].data());
+    }
+
+private:
+    using common_iter = common_iterator;
+
+    void init_common_iterator(const buffer_info &buffer,
+                              const container_type &shape,
+                              common_iter &iterator,
+                              container_type &strides) {
+        auto buffer_shape_iter = buffer.shape.rbegin();
+        auto buffer_strides_iter = buffer.strides.rbegin();
+        auto shape_iter = shape.rbegin();
+        auto strides_iter = strides.rbegin();
+
+        while (buffer_shape_iter != buffer.shape.rend()) {
+            if (*shape_iter == *buffer_shape_iter) {
+                *strides_iter = *buffer_strides_iter;
+            } else {
+                *strides_iter = 0;
+            }
+
+            ++buffer_shape_iter;
+            ++buffer_strides_iter;
+            ++shape_iter;
+            ++strides_iter;
+        }
+
+        std::fill(strides_iter, strides.rend(), 0);
+        iterator = common_iter(buffer.ptr, strides, shape);
+    }
+
+    void increment_common_iterator(size_t dim) {
+        for (auto &iter : m_common_iterator) {
+            iter.increment(dim);
+        }
+    }
+
+    container_type m_shape;
+    container_type m_index;
+    std::array<common_iter, N> m_common_iterator;
+};
+
+enum class broadcast_trivial { non_trivial, c_trivial, f_trivial };
+
+// Populates the shape and number of dimensions for the set of buffers.  Returns a
+// broadcast_trivial enum value indicating whether the broadcast is "trivial"--that is, has each
+// buffer being either a singleton or a full-size, C-contiguous (`c_trivial`) or Fortran-contiguous
+// (`f_trivial`) storage buffer; returns `non_trivial` otherwise.
+template <size_t N>
+broadcast_trivial
+broadcast(const std::array<buffer_info, N> &buffers, ssize_t &ndim, std::vector<ssize_t> &shape) {
+    ndim = std::accumulate(
+        buffers.begin(), buffers.end(), ssize_t(0), [](ssize_t res, const buffer_info &buf) {
+            return std::max(res, buf.ndim);
+        });
+
+    shape.clear();
+    shape.resize((size_t) ndim, 1);
+
+    // Figure out the output size, and make sure all input arrays conform (i.e. are either size 1
+    // or the full size).
+    for (size_t i = 0; i < N; ++i) {
+        auto res_iter = shape.rbegin();
+        auto end = buffers[i].shape.rend();
+        for (auto shape_iter = buffers[i].shape.rbegin(); shape_iter != end;
+             ++shape_iter, ++res_iter) {
+            const auto &dim_size_in = *shape_iter;
+            auto &dim_size_out = *res_iter;
+
+            // Each input dimension can either be 1 or `n`, but `n` values must match across
+            // buffers
+            if (dim_size_out == 1) {
+                dim_size_out = dim_size_in;
+            } else if (dim_size_in != 1 && dim_size_in != dim_size_out) {
+                pybind11_fail("pybind11::vectorize: incompatible size/dimension of inputs!");
+            }
+        }
+    }
+
+    bool trivial_broadcast_c = true;
+    bool trivial_broadcast_f = true;
+    for (size_t i = 0; i < N && (trivial_broadcast_c || trivial_broadcast_f); ++i) {
+        if (buffers[i].size == 1) {
+            continue;
+        }
+
+        // Require the same number of dimensions:
+        if (buffers[i].ndim != ndim) {
+            return broadcast_trivial::non_trivial;
+        }
+
+        // Require all dimensions be full-size:
+        if (!std::equal(buffers[i].shape.cbegin(), buffers[i].shape.cend(), shape.cbegin())) {
+            return broadcast_trivial::non_trivial;
+        }
+
+        // Check for C contiguity (but only if previous inputs were also C contiguous)
+        if (trivial_broadcast_c) {
+            ssize_t expect_stride = buffers[i].itemsize;
+            auto end = buffers[i].shape.crend();
+            for (auto shape_iter = buffers[i].shape.crbegin(),
+                      stride_iter = buffers[i].strides.crbegin();
+                 trivial_broadcast_c && shape_iter != end;
+                 ++shape_iter, ++stride_iter) {
+                if (expect_stride == *stride_iter) {
+                    expect_stride *= *shape_iter;
+                } else {
+                    trivial_broadcast_c = false;
+                }
+            }
+        }
+
+        // Check for Fortran contiguity (if previous inputs were also F contiguous)
+        if (trivial_broadcast_f) {
+            ssize_t expect_stride = buffers[i].itemsize;
+            auto end = buffers[i].shape.cend();
+            for (auto shape_iter = buffers[i].shape.cbegin(),
+                      stride_iter = buffers[i].strides.cbegin();
+                 trivial_broadcast_f && shape_iter != end;
+                 ++shape_iter, ++stride_iter) {
+                if (expect_stride == *stride_iter) {
+                    expect_stride *= *shape_iter;
+                } else {
+                    trivial_broadcast_f = false;
+                }
+            }
+        }
+    }
+
+    return trivial_broadcast_c   ? broadcast_trivial::c_trivial
+           : trivial_broadcast_f ? broadcast_trivial::f_trivial
+                                 : broadcast_trivial::non_trivial;
+}
+
+template <typename T>
+struct vectorize_arg {
+    static_assert(!std::is_rvalue_reference<T>::value,
+                  "Functions with rvalue reference arguments cannot be vectorized");
+    // The wrapped function gets called with this type:
+    using call_type = remove_reference_t<T>;
+    // Is this a vectorized argument?
+    static constexpr bool vectorize
+        = satisfies_any_of<call_type, std::is_arithmetic, is_complex, is_pod>::value
+          && satisfies_none_of<call_type,
+                               std::is_pointer,
+                               std::is_array,
+                               is_std_array,
+                               std::is_enum>::value
+          && (!std::is_reference<T>::value
+              || (std::is_lvalue_reference<T>::value && std::is_const<call_type>::value));
+    // Accept this type: an array for vectorized types, otherwise the type as-is:
+    using type = conditional_t<vectorize, array_t<remove_cv_t<call_type>, array::forcecast>, T>;
+};
+
+// py::vectorize when a return type is present
+template <typename Func, typename Return, typename... Args>
+struct vectorize_returned_array {
+    using Type = array_t<Return>;
+
+    static Type create(broadcast_trivial trivial, const std::vector<ssize_t> &shape) {
+        if (trivial == broadcast_trivial::f_trivial) {
+            return array_t<Return, array::f_style>(shape);
+        }
+        return array_t<Return>(shape);
+    }
+
+    static Return *mutable_data(Type &array) { return array.mutable_data(); }
+
+    static Return call(Func &f, Args &...args) { return f(args...); }
+
+    static void call(Return *out, size_t i, Func &f, Args &...args) { out[i] = f(args...); }
+};
+
+// py::vectorize when a return type is not present
+template <typename Func, typename... Args>
+struct vectorize_returned_array<Func, void, Args...> {
+    using Type = none;
+
+    static Type create(broadcast_trivial, const std::vector<ssize_t> &) { return none(); }
+
+    static void *mutable_data(Type &) { return nullptr; }
+
+    static detail::void_type call(Func &f, Args &...args) {
+        f(args...);
+        return {};
+    }
+
+    static void call(void *, size_t, Func &f, Args &...args) { f(args...); }
+};
+
+template <typename Func, typename Return, typename... Args>
+struct vectorize_helper {
+
+// NVCC for some reason breaks if NVectorized is private
+#ifdef __CUDACC__
+public:
+#else
+private:
+#endif
+
+    static constexpr size_t N = sizeof...(Args);
+    static constexpr size_t NVectorized = constexpr_sum(vectorize_arg<Args>::vectorize...);
+    static_assert(
+        NVectorized >= 1,
+        "pybind11::vectorize(...) requires a function with at least one vectorizable argument");
+
+public:
+    template <typename T,
+              // SFINAE to prevent shadowing the copy constructor.
+              typename = detail::enable_if_t<
+                  !std::is_same<vectorize_helper, typename std::decay<T>::type>::value>>
+    explicit vectorize_helper(T &&f) : f(std::forward<T>(f)) {}
+
+    object operator()(typename vectorize_arg<Args>::type... args) {
+        return run(args...,
+                   make_index_sequence<N>(),
+                   select_indices<vectorize_arg<Args>::vectorize...>(),
+                   make_index_sequence<NVectorized>());
+    }
+
+private:
+    remove_reference_t<Func> f;
+
+    // Internal compiler error in MSVC 19.16.27025.1 (Visual Studio 2017 15.9.4), when compiling
+    // with "/permissive-" flag when arg_call_types is manually inlined.
+    using arg_call_types = std::tuple<typename vectorize_arg<Args>::call_type...>;
+    template <size_t Index>
+    using param_n_t = typename std::tuple_element<Index, arg_call_types>::type;
+
+    using returned_array = vectorize_returned_array<Func, Return, Args...>;
+
+    // Runs a vectorized function given arguments tuple and three index sequences:
+    //     - Index is the full set of 0 ... (N-1) argument indices;
+    //     - VIndex is the subset of argument indices with vectorized parameters, letting us access
+    //       vectorized arguments (anything not in this sequence is passed through)
+    //     - BIndex is a incremental sequence (beginning at 0) of the same size as VIndex, so that
+    //       we can store vectorized buffer_infos in an array (argument VIndex has its buffer at
+    //       index BIndex in the array).
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    object run(typename vectorize_arg<Args>::type &...args,
+               index_sequence<Index...> i_seq,
+               index_sequence<VIndex...> vi_seq,
+               index_sequence<BIndex...> bi_seq) {
+
+        // Pointers to values the function was called with; the vectorized ones set here will start
+        // out as array_t<T> pointers, but they will be changed them to T pointers before we make
+        // call the wrapped function.  Non-vectorized pointers are left as-is.
+        std::array<void *, N> params{{reinterpret_cast<void *>(&args)...}};
+
+        // The array of `buffer_info`s of vectorized arguments:
+        std::array<buffer_info, NVectorized> buffers{
+            {reinterpret_cast<array *>(params[VIndex])->request()...}};
+
+        /* Determine dimensions parameters of output array */
+        ssize_t nd = 0;
+        std::vector<ssize_t> shape(0);
+        auto trivial = broadcast(buffers, nd, shape);
+        auto ndim = (size_t) nd;
+
+        size_t size
+            = std::accumulate(shape.begin(), shape.end(), (size_t) 1, std::multiplies<size_t>());
+
+        // If all arguments are 0-dimension arrays (i.e. single values) return a plain value (i.e.
+        // not wrapped in an array).
+        if (size == 1 && ndim == 0) {
+            PYBIND11_EXPAND_SIDE_EFFECTS(params[VIndex] = buffers[BIndex].ptr);
+            return cast(
+                returned_array::call(f, *reinterpret_cast<param_n_t<Index> *>(params[Index])...));
+        }
+
+        auto result = returned_array::create(trivial, shape);
+
+        PYBIND11_WARNING_PUSH
+#ifdef PYBIND11_DETECTED_CLANG_WITH_MISLEADING_CALL_STD_MOVE_EXPLICITLY_WARNING
+        PYBIND11_WARNING_DISABLE_CLANG("-Wreturn-std-move")
+#endif
+
+        if (size == 0) {
+            return result;
+        }
+
+        /* Call the function */
+        auto *mutable_data = returned_array::mutable_data(result);
+        if (trivial == broadcast_trivial::non_trivial) {
+            apply_broadcast(buffers, params, mutable_data, size, shape, i_seq, vi_seq, bi_seq);
+        } else {
+            apply_trivial(buffers, params, mutable_data, size, i_seq, vi_seq, bi_seq);
+        }
+
+        return result;
+        PYBIND11_WARNING_POP
+    }
+
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    void apply_trivial(std::array<buffer_info, NVectorized> &buffers,
+                       std::array<void *, N> &params,
+                       Return *out,
+                       size_t size,
+                       index_sequence<Index...>,
+                       index_sequence<VIndex...>,
+                       index_sequence<BIndex...>) {
+
+        // Initialize an array of mutable byte references and sizes with references set to the
+        // appropriate pointer in `params`; as we iterate, we'll increment each pointer by its size
+        // (except for singletons, which get an increment of 0).
+        std::array<std::pair<unsigned char *&, const size_t>, NVectorized> vecparams{
+            {std::pair<unsigned char *&, const size_t>(
+                reinterpret_cast<unsigned char *&>(params[VIndex] = buffers[BIndex].ptr),
+                buffers[BIndex].size == 1 ? 0 : sizeof(param_n_t<VIndex>))...}};
+
+        for (size_t i = 0; i < size; ++i) {
+            returned_array::call(
+                out, i, f, *reinterpret_cast<param_n_t<Index> *>(params[Index])...);
+            for (auto &x : vecparams) {
+                x.first += x.second;
+            }
+        }
+    }
+
+    template <size_t... Index, size_t... VIndex, size_t... BIndex>
+    void apply_broadcast(std::array<buffer_info, NVectorized> &buffers,
+                         std::array<void *, N> &params,
+                         Return *out,
+                         size_t size,
+                         const std::vector<ssize_t> &output_shape,
+                         index_sequence<Index...>,
+                         index_sequence<VIndex...>,
+                         index_sequence<BIndex...>) {
+
+        multi_array_iterator<NVectorized> input_iter(buffers, output_shape);
+
+        for (size_t i = 0; i < size; ++i, ++input_iter) {
+            PYBIND11_EXPAND_SIDE_EFFECTS((params[VIndex] = input_iter.template data<BIndex>()));
+            returned_array::call(
+                out, i, f, *reinterpret_cast<param_n_t<Index> *>(std::get<Index>(params))...);
+        }
+    }
+};
+
+template <typename Func, typename Return, typename... Args>
+vectorize_helper<Func, Return, Args...> vectorize_extractor(const Func &f, Return (*)(Args...)) {
+    return detail::vectorize_helper<Func, Return, Args...>(f);
+}
+
+template <typename T, int Flags>
+struct handle_type_name<array_t<T, Flags>> {
+    static constexpr auto name
+        = const_name("numpy.ndarray[") + npy_format_descriptor<T>::name + const_name("]");
+};
+
+PYBIND11_NAMESPACE_END(detail)
+
+// Vanilla pointer vectorizer:
+template <typename Return, typename... Args>
+detail::vectorize_helper<Return (*)(Args...), Return, Args...> vectorize(Return (*f)(Args...)) {
+    return detail::vectorize_helper<Return (*)(Args...), Return, Args...>(f);
+}
+
+// lambda vectorizer:
+template <typename Func, detail::enable_if_t<detail::is_lambda<Func>::value, int> = 0>
+auto vectorize(Func &&f)
+    -> decltype(detail::vectorize_extractor(std::forward<Func>(f),
+                                            (detail::function_signature_t<Func> *) nullptr)) {
+    return detail::vectorize_extractor(std::forward<Func>(f),
+                                       (detail::function_signature_t<Func> *) nullptr);
+}
+
+// Vectorize a class method (non-const):
+template <typename Return,
+          typename Class,
+          typename... Args,
+          typename Helper = detail::vectorize_helper<
+              decltype(std::mem_fn(std::declval<Return (Class::*)(Args...)>())),
+              Return,
+              Class *,
+              Args...>>
+Helper vectorize(Return (Class::*f)(Args...)) {
+    return Helper(std::mem_fn(f));
+}
+
+// Vectorize a class method (const):
+template <typename Return,
+          typename Class,
+          typename... Args,
+          typename Helper = detail::vectorize_helper<
+              decltype(std::mem_fn(std::declval<Return (Class::*)(Args...) const>())),
+              Return,
+              const Class *,
+              Args...>>
+Helper vectorize(Return (Class::*f)(Args...) const) {
+    return Helper(std::mem_fn(f));
+}
+
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/stl/filesystem.h b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/stl/filesystem.h
new file mode 100644
index 0000000000000000000000000000000000000000..c16a9ae5c2b076cc93671772de04e70c9075893d
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/include/pybind11/stl/filesystem.h
@@ -0,0 +1,124 @@
+// Copyright (c) 2021 The Pybind Development Team.
+// All rights reserved. Use of this source code is governed by a
+// BSD-style license that can be found in the LICENSE file.
+
+#pragma once
+
+#include <pybind11/cast.h>
+#include <pybind11/detail/common.h>
+#include <pybind11/detail/descr.h>
+#include <pybind11/pybind11.h>
+#include <pybind11/pytypes.h>
+
+#include <string>
+
+#ifdef __has_include
+#    if defined(PYBIND11_CPP17)
+#        if __has_include(<filesystem>)
+#            include <filesystem>
+#            define PYBIND11_HAS_FILESYSTEM 1
+#        elif __has_include(<experimental/filesystem>)
+#            include <experimental/filesystem>
+#            define PYBIND11_HAS_EXPERIMENTAL_FILESYSTEM 1
+#        endif
+#    endif
+#endif
+
+#if !defined(PYBIND11_HAS_FILESYSTEM) && !defined(PYBIND11_HAS_EXPERIMENTAL_FILESYSTEM)           \
+    && !defined(PYBIND11_HAS_FILESYSTEM_IS_OPTIONAL)
+#    error                                                                                        \
+        "Neither #include <filesystem> nor #include <experimental/filesystem is available. (Use -DPYBIND11_HAS_FILESYSTEM_IS_OPTIONAL to ignore.)"
+#endif
+
+PYBIND11_NAMESPACE_BEGIN(PYBIND11_NAMESPACE)
+PYBIND11_NAMESPACE_BEGIN(detail)
+
+#ifdef PYPY_VERSION
+#    define PYBIND11_REINTERPRET_CAST_VOID_PTR_IF_NOT_PYPY(...) (__VA_ARGS__)
+#else
+#    define PYBIND11_REINTERPRET_CAST_VOID_PTR_IF_NOT_PYPY(...)                                   \
+        (reinterpret_cast<void *>(__VA_ARGS__))
+#endif
+
+#if defined(PYBIND11_HAS_FILESYSTEM) || defined(PYBIND11_HAS_EXPERIMENTAL_FILESYSTEM)
+template <typename T>
+struct path_caster {
+
+private:
+    static PyObject *unicode_from_fs_native(const std::string &w) {
+#    if !defined(PYPY_VERSION)
+        return PyUnicode_DecodeFSDefaultAndSize(w.c_str(), ssize_t(w.size()));
+#    else
+        // PyPy mistakenly declares the first parameter as non-const.
+        return PyUnicode_DecodeFSDefaultAndSize(const_cast<char *>(w.c_str()), ssize_t(w.size()));
+#    endif
+    }
+
+    static PyObject *unicode_from_fs_native(const std::wstring &w) {
+        return PyUnicode_FromWideChar(w.c_str(), ssize_t(w.size()));
+    }
+
+public:
+    static handle cast(const T &path, return_value_policy, handle) {
+        if (auto py_str = unicode_from_fs_native(path.native())) {
+            return module_::import("pathlib")
+                .attr("Path")(reinterpret_steal<object>(py_str))
+                .release();
+        }
+        return nullptr;
+    }
+
+    bool load(handle handle, bool) {
+        // PyUnicode_FSConverter and PyUnicode_FSDecoder normally take care of
+        // calling PyOS_FSPath themselves, but that's broken on PyPy (PyPy
+        // issue #3168) so we do it ourselves instead.
+        PyObject *buf = PyOS_FSPath(handle.ptr());
+        if (!buf) {
+            PyErr_Clear();
+            return false;
+        }
+        PyObject *native = nullptr;
+        if constexpr (std::is_same_v<typename T::value_type, char>) {
+            if (PyUnicode_FSConverter(buf, PYBIND11_REINTERPRET_CAST_VOID_PTR_IF_NOT_PYPY(&native))
+                != 0) {
+                if (auto *c_str = PyBytes_AsString(native)) {
+                    // AsString returns a pointer to the internal buffer, which
+                    // must not be free'd.
+                    value = c_str;
+                }
+            }
+        } else if constexpr (std::is_same_v<typename T::value_type, wchar_t>) {
+            if (PyUnicode_FSDecoder(buf, PYBIND11_REINTERPRET_CAST_VOID_PTR_IF_NOT_PYPY(&native))
+                != 0) {
+                if (auto *c_str = PyUnicode_AsWideCharString(native, nullptr)) {
+                    // AsWideCharString returns a new string that must be free'd.
+                    value = c_str; // Copies the string.
+                    PyMem_Free(c_str);
+                }
+            }
+        }
+        Py_XDECREF(native);
+        Py_DECREF(buf);
+        if (PyErr_Occurred()) {
+            PyErr_Clear();
+            return false;
+        }
+        return true;
+    }
+
+    PYBIND11_TYPE_CASTER(T, const_name("os.PathLike"));
+};
+
+#endif // PYBIND11_HAS_FILESYSTEM || defined(PYBIND11_HAS_EXPERIMENTAL_FILESYSTEM)
+
+#if defined(PYBIND11_HAS_FILESYSTEM)
+template <>
+struct type_caster<std::filesystem::path> : public path_caster<std::filesystem::path> {};
+#elif defined(PYBIND11_HAS_EXPERIMENTAL_FILESYSTEM)
+template <>
+struct type_caster<std::experimental::filesystem::path>
+    : public path_caster<std::experimental::filesystem::path> {};
+#endif
+
+PYBIND11_NAMESPACE_END(detail)
+PYBIND11_NAMESPACE_END(PYBIND11_NAMESPACE)
diff --git a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Common.cmake b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Common.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..7d8d94b11d1d627a604dace61a8b0bb546f77d7b
--- /dev/null
+++ b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/pybind11/share/cmake/pybind11/pybind11Common.cmake
@@ -0,0 +1,455 @@
+#[======================================================[.rst
+
+Adds the following targets::
+
+    pybind11::pybind11 - link to Python headers and pybind11::headers
+    pybind11::module - Adds module links
+    pybind11::embed - Adds embed links
+    pybind11::lto - Link time optimizations (only if CMAKE_INTERPROCEDURAL_OPTIMIZATION is not set)
+    pybind11::thin_lto - Link time optimizations (only if CMAKE_INTERPROCEDURAL_OPTIMIZATION is not set)
+    pybind11::python_link_helper - Adds link to Python libraries
+    pybind11::windows_extras - MSVC bigobj and mp for building multithreaded
+    pybind11::opt_size - avoid optimizations that increase code size
+
+Adds the following functions::
+
+    pybind11_strip(target) - strip target after building on linux/macOS
+    pybind11_find_import(module) - See if a module is installed.
+
+#]======================================================]
+
+# CMake 3.10 has an include_guard command, but we can't use that yet
+# include_guard(global) (pre-CMake 3.10)
+if(TARGET pybind11::pybind11)
+  return()
+endif()
+
+# If we are in subdirectory mode, all IMPORTED targets must be GLOBAL. If we
+# are in CONFIG mode, they should be "normal" targets instead.
+# In CMake 3.11+ you can promote a target to global after you create it,
+# which might be simpler than this check.
+get_property(
+  is_config
+  TARGET pybind11::headers
+  PROPERTY IMPORTED)
+if(NOT is_config)
+  set(optional_global GLOBAL)
+endif()
+
+# If not run in Python mode, we still would like this to at least
+# include pybind11's include directory:
+set(pybind11_INCLUDE_DIRS
+    "${pybind11_INCLUDE_DIR}"
+    CACHE INTERNAL "Include directory for pybind11 (Python not requested)")
+
+if(CMAKE_CROSSCOMPILING AND PYBIND11_USE_CROSSCOMPILING)
+  set(_PYBIND11_CROSSCOMPILING
+      ON
+      CACHE INTERNAL "")
+else()
+  set(_PYBIND11_CROSSCOMPILING
+      OFF
+      CACHE INTERNAL "")
+endif()
+
+# --------------------- Shared targets ----------------------------
+
+# Build an interface library target:
+add_library(pybind11::pybind11 IMPORTED INTERFACE ${optional_global})
+set_property(
+  TARGET pybind11::pybind11
+  APPEND
+  PROPERTY INTERFACE_LINK_LIBRARIES pybind11::headers)
+
+# Build a module target:
+add_library(pybind11::module IMPORTED INTERFACE ${optional_global})
+set_property(
+  TARGET pybind11::module
+  APPEND
+  PROPERTY INTERFACE_LINK_LIBRARIES pybind11::pybind11)
+
+# Build an embed library target:
+add_library(pybind11::embed IMPORTED INTERFACE ${optional_global})
+set_property(
+  TARGET pybind11::embed
+  APPEND
+  PROPERTY INTERFACE_LINK_LIBRARIES pybind11::pybind11)
+
+# -------------- emscripten requires exceptions enabled -------------
+# _pybind11_no_exceptions is a private mechanism to disable this addition.
+# Please open an issue if you need to use it; it will be removed if no one
+# needs it.
+if(CMAKE_SYSTEM_NAME MATCHES Emscripten AND NOT _pybind11_no_exceptions)
+  if(CMAKE_VERSION VERSION_LESS 3.13)
+    message(WARNING "CMake 3.13+ is required to build for Emscripten. Some flags will be missing")
+  else()
+    if(is_config)
+      set(_tmp_config_target pybind11::pybind11_headers)
+    else()
+      set(_tmp_config_target pybind11_headers)
+    endif()
+
+    set_property(
+      TARGET ${_tmp_config_target}
+      APPEND
+      PROPERTY INTERFACE_LINK_OPTIONS -fexceptions)
+    set_property(
+      TARGET ${_tmp_config_target}
+      APPEND
+      PROPERTY INTERFACE_COMPILE_OPTIONS -fexceptions)
+    unset(_tmp_config_target)
+  endif()
+endif()
+
+# --------------------------- link helper ---------------------------
+
+add_library(pybind11::python_link_helper IMPORTED INTERFACE ${optional_global})
+
+if(CMAKE_VERSION VERSION_LESS 3.13)
+  # In CMake 3.11+, you can set INTERFACE properties via the normal methods, and
+  # this would be simpler.
+  set_property(
+    TARGET pybind11::python_link_helper
+    APPEND
+    PROPERTY INTERFACE_LINK_LIBRARIES "$<$<PLATFORM_ID:Darwin>:-undefined dynamic_lookup>")
+else()
+  # link_options was added in 3.13+
+  # This is safer, because you are ensured the deduplication pass in CMake will not consider
+  # these separate and remove one but not the other.
+  set_property(
+    TARGET pybind11::python_link_helper
+    APPEND
+    PROPERTY INTERFACE_LINK_OPTIONS "$<$<PLATFORM_ID:Darwin>:LINKER:-undefined,dynamic_lookup>")
+endif()
+
+# ------------------------ Windows extras -------------------------
+
+add_library(pybind11::windows_extras IMPORTED INTERFACE ${optional_global})
+
+if(MSVC) # That's also clang-cl
+  # /bigobj is needed for bigger binding projects due to the limit to 64k
+  # addressable sections
+  set_property(
+    TARGET pybind11::windows_extras
+    APPEND
+    PROPERTY INTERFACE_COMPILE_OPTIONS $<$<COMPILE_LANGUAGE:CXX>:/bigobj>)
+
+  # /MP enables multithreaded builds (relevant when there are many files) for MSVC
+  if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") # no Clang no Intel
+    if(CMAKE_VERSION VERSION_LESS 3.11)
+      set_property(
+        TARGET pybind11::windows_extras
+        APPEND
+        PROPERTY INTERFACE_COMPILE_OPTIONS $<$<NOT:$<CONFIG:Debug>>:/MP>)
+    else()
+      # Only set these options for C++ files.  This is important so that, for
+      # instance, projects that include other types of source files like CUDA
+      # .cu files don't get these options propagated to nvcc since that would
+      # cause the build to fail.
+      set_property(
+        TARGET pybind11::windows_extras
+        APPEND
+        PROPERTY INTERFACE_COMPILE_OPTIONS
+                 $<$<NOT:$<CONFIG:Debug>>:$<$<COMPILE_LANGUAGE:CXX>:/MP>>)
+    endif()
+  endif()
+endif()
+
+# ----------------------- Optimize binary size --------------------------
+
+add_library(pybind11::opt_size IMPORTED INTERFACE ${optional_global})
+
+if(MSVC)
+  set(PYBIND11_OPT_SIZE /Os)
+else()
+  set(PYBIND11_OPT_SIZE -Os)
+endif()
+
+set_property(
+  TARGET pybind11::opt_size
+  APPEND
+  PROPERTY INTERFACE_COMPILE_OPTIONS $<$<CONFIG:Release>:${PYBIND11_OPT_SIZE}>
+           $<$<CONFIG:MinSizeRel>:${PYBIND11_OPT_SIZE}>
+           $<$<CONFIG:RelWithDebInfo>:${PYBIND11_OPT_SIZE}>)
+
+# ----------------------- Legacy option --------------------------
+
+# Warn or error if old variable name used
+if(PYBIND11_CPP_STANDARD)
+  string(REGEX MATCH [[..$]] VAL "${PYBIND11_CPP_STANDARD}")
+  if(CMAKE_CXX_STANDARD)
+    if(NOT CMAKE_CXX_STANDARD STREQUAL VAL)
+      message(WARNING "CMAKE_CXX_STANDARD=${CMAKE_CXX_STANDARD} does not match "
+                      "PYBIND11_CPP_STANDARD=${PYBIND11_CPP_STANDARD}, "
+                      "please remove PYBIND11_CPP_STANDARD from your cache")
+    endif()
+  else()
+    set(supported_standards 11 14 17 20)
+    if("${VAL}" IN_LIST supported_standards)
+      message(WARNING "USE -DCMAKE_CXX_STANDARD=${VAL} instead of PYBIND11_CPP_STANDARD")
+      set(CMAKE_CXX_STANDARD
+          ${VAL}
+          CACHE STRING "From PYBIND11_CPP_STANDARD")
+    else()
+      message(FATAL_ERROR "PYBIND11_CPP_STANDARD should be replaced with CMAKE_CXX_STANDARD "
+                          "(last two chars: ${VAL} not understood as a valid CXX std)")
+    endif()
+  endif()
+endif()
+
+# --------------------- Python specifics -------------------------
+
+# CMake 3.27 removes the classic FindPythonInterp if CMP0148 is NEW
+if(CMAKE_VERSION VERSION_LESS "3.27")
+  set(_pybind11_missing_old_python "OLD")
+else()
+  cmake_policy(GET CMP0148 _pybind11_missing_old_python)
+endif()
+
+# Check to see which Python mode we are in, new, old, or no python
+if(PYBIND11_NOPYTHON)
+  set(_pybind11_nopython ON)
+  # We won't use new FindPython if PYBIND11_FINDPYTHON is defined and falselike
+  # Otherwise, we use if FindPythonLibs is missing or if FindPython was already used
+elseif(
+  (NOT DEFINED PYBIND11_FINDPYTHON OR PYBIND11_FINDPYTHON)
+  AND (_pybind11_missing_old_python STREQUAL "NEW"
+       OR PYBIND11_FINDPYTHON
+       OR Python_FOUND
+       OR Python3_FOUND
+      ))
+
+  # New mode
+  include("${CMAKE_CURRENT_LIST_DIR}/pybind11NewTools.cmake")
+
+else()
+
+  # Classic mode
+  include("${CMAKE_CURRENT_LIST_DIR}/pybind11Tools.cmake")
+
+endif()
+
+# --------------------- pybind11_find_import -------------------------------
+
+if(NOT _pybind11_nopython AND NOT _PYBIND11_CROSSCOMPILING)
+  # Check to see if modules are importable. Use REQUIRED to force an error if
+  # one of the modules is not found. <package_name>_FOUND will be set if the
+  # package was found (underscores replace dashes if present). QUIET will hide
+  # the found message, and VERSION will require a minimum version. A successful
+  # find will cache the result.
+  function(pybind11_find_import PYPI_NAME)
+    # CMake variables need underscores (PyPI doesn't care)
+    string(REPLACE "-" "_" NORM_PYPI_NAME "${PYPI_NAME}")
+
+    # Return if found previously
+    if(${NORM_PYPI_NAME}_FOUND)
+      return()
+    endif()
+
+    set(options "REQUIRED;QUIET")
+    set(oneValueArgs "VERSION")
+    cmake_parse_arguments(ARG "${options}" "${oneValueArgs}" "" ${ARGN})
+
+    if(ARG_REQUIRED)
+      set(status_level FATAL_ERROR)
+    else()
+      set(status_level WARNING)
+    endif()
+
+    execute_process(
+      COMMAND
+        ${${_Python}_EXECUTABLE} -c "
+try:
+    from importlib.metadata import version
+except ImportError:
+    from pkg_resources import get_distribution
+    def version(s):
+        return get_distribution(s).version
+print(version('${PYPI_NAME}'))
+        "
+      RESULT_VARIABLE RESULT_PRESENT
+      OUTPUT_VARIABLE PKG_VERSION
+      ERROR_QUIET)
+
+    string(STRIP "${PKG_VERSION}" PKG_VERSION)
+
+    # If a result is present, this failed
+    if(RESULT_PRESENT)
+      set(${NORM_PYPI_NAME}_FOUND
+          ${NORM_PYPI_NAME}-NOTFOUND
+          CACHE INTERNAL "")
+      # Always warn or error
+      message(
+        ${status_level}
+        "Missing: ${PYPI_NAME} ${ARG_VERSION}\nTry: ${${_Python}_EXECUTABLE} -m pip install ${PYPI_NAME}"
+      )
+    else()
+      if(ARG_VERSION AND PKG_VERSION VERSION_LESS ARG_VERSION)
+        message(
+          ${status_level}
+          "Version incorrect: ${PYPI_NAME} ${PKG_VERSION} found, ${ARG_VERSION} required - try upgrading"
+        )
+      else()
+        set(${NORM_PYPI_NAME}_FOUND
+            YES
+            CACHE INTERNAL "")
+        set(${NORM_PYPI_NAME}_VERSION
+            ${PKG_VERSION}
+            CACHE INTERNAL "")
+      endif()
+      if(NOT ARG_QUIET)
+        message(STATUS "Found ${PYPI_NAME} ${PKG_VERSION}")
+      endif()
+    endif()
+    if(NOT ARG_VERSION OR (NOT PKG_VERSION VERSION_LESS ARG_VERSION))
+      # We have successfully found a good version, cache to avoid calling again.
+    endif()
+  endfunction()
+endif()
+
+# --------------------- LTO -------------------------------
+
+include(CheckCXXCompilerFlag)
+
+# Checks whether the given CXX/linker flags can compile and link a cxx file.
+# cxxflags and linkerflags are lists of flags to use.  The result variable is a
+# unique variable name for each set of flags: the compilation result will be
+# cached base on the result variable.  If the flags work, sets them in
+# cxxflags_out/linkerflags_out internal cache variables (in addition to
+# ${result}).
+function(_pybind11_return_if_cxx_and_linker_flags_work result cxxflags linkerflags cxxflags_out
+         linkerflags_out)
+  set(CMAKE_REQUIRED_LIBRARIES ${linkerflags})
+  check_cxx_compiler_flag("${cxxflags}" ${result})
+  if(${result})
+    set(${cxxflags_out}
+        "${cxxflags}"
+        PARENT_SCOPE)
+    set(${linkerflags_out}
+        "${linkerflags}"
+        PARENT_SCOPE)
+  endif()
+endfunction()
+
+function(_pybind11_generate_lto target prefer_thin_lto)
+  if(MINGW)
+    message(STATUS "${target} disabled (problems with undefined symbols for MinGW for now)")
+    return()
+  endif()
+
+  if(CMAKE_CXX_COMPILER_ID MATCHES "GNU|Clang")
+    set(cxx_append "")
+    set(linker_append "")
+    if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT APPLE)
+      # Clang Gold plugin does not support -Os; append -O3 to MinSizeRel builds to override it
+      set(linker_append ";$<$<CONFIG:MinSizeRel>:-O3>")
+    elseif(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND NOT MINGW)
+      set(cxx_append ";-fno-fat-lto-objects")
+    endif()
+
+    if(prefer_thin_lto)
+      set(thin "=thin")
+    else()
+      set(thin "")
+    endif()
+
+    if(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64le" OR CMAKE_SYSTEM_PROCESSOR MATCHES "mips64")
+      # Do nothing
+    elseif(CMAKE_SYSTEM_NAME MATCHES Emscripten)
+      # This compile is very costly when cross-compiling, so set this without checking
+      set(PYBIND11_LTO_CXX_FLAGS "-flto${thin}${cxx_append}")
+      set(PYBIND11_LTO_LINKER_FLAGS "-flto${thin}${linker_append}")
+    elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      _pybind11_return_if_cxx_and_linker_flags_work(
+        HAS_FLTO_THIN "-flto${thin}${cxx_append}" "-flto${thin}${linker_append}"
+        PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+    endif()
+    if(NOT HAS_FLTO_THIN)
+      _pybind11_return_if_cxx_and_linker_flags_work(
+        HAS_FLTO "-flto${cxx_append}" "-flto${linker_append}" PYBIND11_LTO_CXX_FLAGS
+        PYBIND11_LTO_LINKER_FLAGS)
+    endif()
+  elseif(CMAKE_CXX_COMPILER_ID MATCHES "IntelLLVM")
+    # IntelLLVM equivalent to LTO is called IPO; also IntelLLVM is WIN32/UNIX
+    # WARNING/HELP WANTED: This block of code is currently not covered by pybind11 GitHub Actions!
+    if(WIN32)
+      _pybind11_return_if_cxx_and_linker_flags_work(
+        HAS_INTEL_IPO "-Qipo" "-Qipo" PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+    else()
+      _pybind11_return_if_cxx_and_linker_flags_work(
+        HAS_INTEL_IPO "-ipo" "-ipo" PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+    endif()
+  elseif(CMAKE_CXX_COMPILER_ID MATCHES "Intel")
+    # Intel equivalent to LTO is called IPO
+    _pybind11_return_if_cxx_and_linker_flags_work(HAS_INTEL_IPO "-ipo" "-ipo"
+                                                  PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+  elseif(MSVC)
+    # cmake only interprets libraries as linker flags when they start with a - (otherwise it
+    # converts /LTCG to \LTCG as if it was a Windows path).  Luckily MSVC supports passing flags
+    # with - instead of /, even if it is a bit non-standard:
+    _pybind11_return_if_cxx_and_linker_flags_work(HAS_MSVC_GL_LTCG "/GL" "-LTCG"
+                                                  PYBIND11_LTO_CXX_FLAGS PYBIND11_LTO_LINKER_FLAGS)
+  endif()
+
+  # Enable LTO flags if found, except for Debug builds
+  if(PYBIND11_LTO_CXX_FLAGS)
+    # CONFIG takes multiple values in CMake 3.19+, until then we have to use OR
+    set(is_debug "$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>")
+    set(not_debug "$<NOT:${is_debug}>")
+    set(cxx_lang "$<COMPILE_LANGUAGE:CXX>")
+    if(MSVC AND CMAKE_VERSION VERSION_LESS 3.11)
+      set(genex "${not_debug}")
+    else()
+      set(genex "$<AND:${not_debug},${cxx_lang}>")
+    endif()
+    set_property(
+      TARGET ${target}
+      APPEND
+      PROPERTY INTERFACE_COMPILE_OPTIONS "$<${genex}:${PYBIND11_LTO_CXX_FLAGS}>")
+    if(CMAKE_PROJECT_NAME STREQUAL "pybind11")
+      message(STATUS "${target} enabled")
+    endif()
+  else()
+    if(CMAKE_PROJECT_NAME STREQUAL "pybind11")
+      message(STATUS "${target} disabled (not supported by the compiler and/or linker)")
+    endif()
+  endif()
+
+  if(PYBIND11_LTO_LINKER_FLAGS)
+    if(CMAKE_VERSION VERSION_LESS 3.11)
+      set_property(
+        TARGET ${target}
+        APPEND
+        PROPERTY INTERFACE_LINK_LIBRARIES "$<${not_debug}:${PYBIND11_LTO_LINKER_FLAGS}>")
+    else()
+      set_property(
+        TARGET ${target}
+        APPEND
+        PROPERTY INTERFACE_LINK_OPTIONS "$<${not_debug}:${PYBIND11_LTO_LINKER_FLAGS}>")
+    endif()
+  endif()
+endfunction()
+
+if(NOT DEFINED CMAKE_INTERPROCEDURAL_OPTIMIZATION)
+  add_library(pybind11::lto IMPORTED INTERFACE ${optional_global})
+  _pybind11_generate_lto(pybind11::lto FALSE)
+
+  add_library(pybind11::thin_lto IMPORTED INTERFACE ${optional_global})
+  _pybind11_generate_lto(pybind11::thin_lto TRUE)
+endif()
+
+# ---------------------- pybind11_strip -----------------------------
+
+function(pybind11_strip target_name)
+  # Strip unnecessary sections of the binary on Linux/macOS
+  if(CMAKE_STRIP)
+    if(APPLE)
+      set(x_opt -x)
+    endif()
+
+    add_custom_command(
+      TARGET ${target_name}
+      POST_BUILD
+      COMMAND ${CMAKE_STRIP} ${x_opt} $<TARGET_FILE:${target_name}>)
+  endif()
+endfunction()