diff --git a/.gitattributes b/.gitattributes
index 93651752679e61d91a4b6c7ee6ef40807128fb1a..0a1aa7e6d18fa08133e1d1c566c2cd00932e1c44 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -120,3 +120,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
 .venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/pyasn1/type/__pycache__/univ.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 .venv/lib/python3.11/site-packages/opencv_python_headless.libs/libvpx-9f572e11.so.9.1.0 filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12 filter=lfs diff=lfs merge=lfs -text
+.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.12 filter=lfs diff=lfs merge=lfs -text
diff --git a/.venv/lib/python3.11/site-packages/nvidia/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a06e3d156e265a8b1a785b12396dfb9464bcb668
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cublas/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47c1362208903c6ef96c60829cc1fd3583542e41
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0f7e1ee93b8a6338f663829b17541f316899dad
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h
new file mode 100644
index 0000000000000000000000000000000000000000..96eadad8a8e8c3979b99910ceea41ceaf2c8b58e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h
@@ -0,0 +1,891 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ * This is the public header file for the CUBLAS library, defining the API
+ *
+ * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subprograms)
+ * on top of the CUDA runtime.
+ */
+
+#if !defined(CUBLAS_H_)
+#define CUBLAS_H_
+
+#if defined(CUBLAS_V2_H_)
+#error "It is an error to include both cublas.h and cublas_v2.h"
+#endif /* defined(CUBLAS_V2_H_) */
+
+#include <cuda_runtime.h>
+
+#ifndef CUBLASWINAPI
+#ifdef _WIN32
+#define CUBLASWINAPI __stdcall
+#else /* !_WIN32: expands to nothing */
+#define CUBLASWINAPI
+#endif /* _WIN32 */
+#endif /* CUBLASWINAPI */
+
+#undef CUBLASAPI /* any earlier definition is discarded and replaced just below */
+#ifdef __CUDACC__
+#define CUBLASAPI __host__
+#else /* !__CUDACC__: expands to nothing */
+#define CUBLASAPI
+#endif /* __CUDACC__ */
+
+#include "cublas_api.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* CUBLAS data types: legacy name cublasStatus is a macro alias for cublasStatus_t */
+#define cublasStatus cublasStatus_t
+
+cublasStatus CUBLASWINAPI cublasInit(void);
+cublasStatus CUBLASWINAPI cublasShutdown(void);
+cublasStatus CUBLASWINAPI cublasGetError(void);
+
+cublasStatus CUBLASWINAPI cublasGetVersion(int* version);
+cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr);
+
+cublasStatus CUBLASWINAPI cublasFree(void* devicePtr);
+
+cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream);
+
+/* ---------------- CUBLAS BLAS1 functions ---------------- */
+/* NRM2: S/D take real x; Sc/Dz take complex x and still return a real (float/double) scalar */
+float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx);
+double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx);
+float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx);
+double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* DOT: S/D return a real scalar; C/Z dotu and dotc return cuComplex/cuDoubleComplex by value */
+float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, int incy);
+double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy);
+cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
+cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
+cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
+cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
+/*------------------------------------------------------------------------*/
+/* SCAL: x is the only pointer and is non-const; Csscal/Zdscal take a real alpha with a complex x */
+void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx);
+void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx);
+void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx);
+void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx);
+
+void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx);
+void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* AXPY: alpha is passed by value; x is const and y is the only non-const vector pointer */
+void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy);
+void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy);
+void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
+void CUBLASWINAPI
+cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
+/*------------------------------------------------------------------------*/
+/* COPY: x is a const pointer and y is non-const; the four element types share one argument shape */
+void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy);
+void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy);
+void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, int incy);
+void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
+/*------------------------------------------------------------------------*/
+/* SWAP: both x and y are non-const pointers */
+void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy);
+void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy);
+void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy);
+void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
+/*------------------------------------------------------------------------*/
+/* AMAX: int result for every element type (presumably an element index; base not stated here -- TODO confirm) */
+int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx);
+int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx);
+int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx);
+int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* AMIN: int result for every element type (presumably an element index; base not stated here -- TODO confirm) */
+int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx);
+int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx);
+
+int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx);
+int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* ASUM: real scalar result; Scasum/Dzasum take complex x but return float/double */
+float CUBLASWINAPI cublasSasum(int n, const float* x, int incx);
+double CUBLASWINAPI cublasDasum(int n, const double* x, int incx);
+float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx);
+double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* ROT: x and y both non-const; Crot/Zrot take a real then a complex coefficient, Csrot/Zdrot two reals */
+void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss);
+void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss);
+void CUBLASWINAPI cublasCrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, cuComplex s);
+void CUBLASWINAPI
+cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs);
+void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s);
+void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s);
+/*------------------------------------------------------------------------*/
+/* ROTG: every operand is a non-const pointer except cb, which the C/Z variants take by value */
+void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss);
+void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss);
+void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs);
+void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs);
+/*------------------------------------------------------------------------*/
+/* ROTM: real types only; x and y are non-const, sparam is passed as a const pointer */
+void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam);
+void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam);
+/*------------------------------------------------------------------------*/
+/* ROTMG: real types only; every argument is a pointer and only sy1 is const */
+void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam);
+void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam);
+
+/* --------------- CUBLAS BLAS2 functions  ---------------- */
+/* GEMV: trans is a single char; alpha/beta are passed by value; A and x are const, y is non-const */
+void CUBLASWINAPI cublasSgemv(char trans,
+                              int m,
+                              int n,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              const float* x,
+                              int incx,
+                              float beta,
+                              float* y,
+                              int incy);
+void CUBLASWINAPI cublasDgemv(char trans,
+                              int m,
+                              int n,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* x,
+                              int incx,
+                              double beta,
+                              double* y,
+                              int incy);
+void CUBLASWINAPI cublasCgemv(char trans,
+                              int m,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* x,
+                              int incx,
+                              cuComplex beta,
+                              cuComplex* y,
+                              int incy);
+void CUBLASWINAPI cublasZgemv(char trans,
+                              int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* y,
+                              int incy);
+/*------------------------------------------------------------------------*/
+/* GBMV: same argument shape as GEMV plus two extra ints, kl and ku, placed after n */
+void CUBLASWINAPI cublasSgbmv(char trans,
+                              int m,
+                              int n,
+                              int kl,
+                              int ku,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              const float* x,
+                              int incx,
+                              float beta,
+                              float* y,
+                              int incy);
+void CUBLASWINAPI cublasDgbmv(char trans,
+                              int m,
+                              int n,
+                              int kl,
+                              int ku,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* x,
+                              int incx,
+                              double beta,
+                              double* y,
+                              int incy);
+void CUBLASWINAPI cublasCgbmv(char trans,
+                              int m,
+                              int n,
+                              int kl,
+                              int ku,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* x,
+                              int incx,
+                              cuComplex beta,
+                              cuComplex* y,
+                              int incy);
+void CUBLASWINAPI cublasZgbmv(char trans,
+                              int m,
+                              int n,
+                              int kl,
+                              int ku,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* y,
+                              int incy);
+/*------------------------------------------------------------------------*/
+/* TRMV: uplo/trans/diag are single chars; A is const and x is the only non-const pointer */
+void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
+void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
+void CUBLASWINAPI
+cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
+void CUBLASWINAPI
+cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* TBMV: TRMV-style argument list with one additional int, k, placed after n */
+void CUBLASWINAPI
+cublasStbmv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
+void CUBLASWINAPI
+cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
+void CUBLASWINAPI
+cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
+void CUBLASWINAPI cublasZtbmv(
+    char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* TPMV: takes a const AP pointer with no lda argument; x is the only non-const pointer */
+void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
+
+void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
+
+void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
+
+void CUBLASWINAPI
+cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* TRSV: same argument list as TRMV; A is const and x is the only non-const pointer */
+void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
+
+void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
+
+void CUBLASWINAPI
+cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
+
+void CUBLASWINAPI
+cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* TPSV: same argument list as TPMV (const AP, no lda); x is the only non-const pointer */
+void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
+
+void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
+
+void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
+
+void CUBLASWINAPI
+cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* TBSV: same argument list as TBMV (extra int k after n); x is the only non-const pointer */
+void CUBLASWINAPI
+cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
+
+void CUBLASWINAPI
+cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
+void CUBLASWINAPI
+cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
+
+void CUBLASWINAPI cublasZtbsv(
+    char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
+/*------------------------------------------------------------------------*/
+/* SYMV/HEMV: S/D entry points are named symv, C/Z are named hemv; all four share one argument shape */
+void CUBLASWINAPI cublasSsymv(
+    char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy);
+void CUBLASWINAPI cublasDsymv(char uplo,
+                              int n,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* x,
+                              int incx,
+                              double beta,
+                              double* y,
+                              int incy);
+void CUBLASWINAPI cublasChemv(char uplo,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* x,
+                              int incx,
+                              cuComplex beta,
+                              cuComplex* y,
+                              int incy);
+void CUBLASWINAPI cublasZhemv(char uplo,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* y,
+                              int incy);
+/*------------------------------------------------------------------------*/
+/* SBMV/HBMV: S/D are sbmv, C/Z are hbmv; relative to SYMV/HEMV there is one extra int, k, after n */
+void CUBLASWINAPI cublasSsbmv(char uplo,
+                              int n,
+                              int k,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              const float* x,
+                              int incx,
+                              float beta,
+                              float* y,
+                              int incy);
+void CUBLASWINAPI cublasDsbmv(char uplo,
+                              int n,
+                              int k,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* x,
+                              int incx,
+                              double beta,
+                              double* y,
+                              int incy);
+void CUBLASWINAPI cublasChbmv(char uplo,
+                              int n,
+                              int k,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* x,
+                              int incx,
+                              cuComplex beta,
+                              cuComplex* y,
+                              int incy);
+void CUBLASWINAPI cublasZhbmv(char uplo,
+                              int n,
+                              int k,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* y,
+                              int incy);
+/*------------------------------------------------------------------------*/
+/* SPMV/HPMV: S/D are spmv, C/Z are hpmv; the matrix is passed as a const AP pointer with no lda */
+void CUBLASWINAPI
+cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy);
+void CUBLASWINAPI cublasDspmv(
+    char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy);
+void CUBLASWINAPI cublasChpmv(char uplo,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* AP,
+                              const cuComplex* x,
+                              int incx,
+                              cuComplex beta,
+                              cuComplex* y,
+                              int incy);
+void CUBLASWINAPI cublasZhpmv(char uplo,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* AP,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* y,
+                              int incy);
+
+/*------------------------------------------------------------------------*/
+/* GER: x and y are const, A is non-const; the complex types have separate geru and gerc entry points */
+void CUBLASWINAPI
+cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
+void CUBLASWINAPI
+cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
+
+void CUBLASWINAPI cublasCgeru(
+    int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
+void CUBLASWINAPI cublasCgerc(
+    int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
+void CUBLASWINAPI cublasZgeru(int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              const cuDoubleComplex* y,
+                              int incy,
+                              cuDoubleComplex* A,
+                              int lda);
+void CUBLASWINAPI cublasZgerc(int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              const cuDoubleComplex* y,
+                              int incy,
+                              cuDoubleComplex* A,
+                              int lda);
+/*------------------------------------------------------------------------*/
+/* SYR/HER: S/D are syr, C/Z are her; her takes a real alpha (float/double) with complex x and A */
+void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda);
+void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda);
+
+void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda);
+void CUBLASWINAPI
+cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda);
+
+/*------------------------------------------------------------------------*/
+/* SPR/HPR: S/D are spr, C/Z are hpr; hpr takes a real alpha; AP is non-const and there is no lda */
+void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP);
+void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP);
+void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP);
+void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP);
+/*------------------------------------------------------------------------*/
+/* SYR2/HER2: S/D are syr2, C/Z are her2; her2 takes alpha of the complex element type; A is non-const */
+void CUBLASWINAPI
+cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
+void CUBLASWINAPI
+cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
+void CUBLASWINAPI cublasCher2(char uplo,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* x,
+                              int incx,
+                              const cuComplex* y,
+                              int incy,
+                              cuComplex* A,
+                              int lda);
+void CUBLASWINAPI cublasZher2(char uplo,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              const cuDoubleComplex* y,
+                              int incy,
+                              cuDoubleComplex* A,
+                              int lda);
+
+/*------------------------------------------------------------------------*/
+/* SPR2/HPR2: S/D are spr2, C/Z are hpr2; x and y are const, AP is non-const, and there is no lda */
+void CUBLASWINAPI
+cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP);
+void CUBLASWINAPI
+cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP);
+void CUBLASWINAPI cublasChpr2(
+    char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP);
+void CUBLASWINAPI cublasZhpr2(char uplo,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* x,
+                              int incx,
+                              const cuDoubleComplex* y,
+                              int incy,
+                              cuDoubleComplex* AP);
+/* ------------------------BLAS3 Functions ------------------------------- */
+/* GEMM: transa/transb are single chars; A and B are const, C is non-const; each has its own int (lda/ldb/ldc) */
+void CUBLASWINAPI cublasSgemm(char transa,
+                              char transb,
+                              int m,
+                              int n,
+                              int k,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              const float* B,
+                              int ldb,
+                              float beta,
+                              float* C,
+                              int ldc);
+void CUBLASWINAPI cublasDgemm(char transa,
+                              char transb,
+                              int m,
+                              int n,
+                              int k,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* B,
+                              int ldb,
+                              double beta,
+                              double* C,
+                              int ldc);
+void CUBLASWINAPI cublasCgemm(char transa,
+                              char transb,
+                              int m,
+                              int n,
+                              int k,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* B,
+                              int ldb,
+                              cuComplex beta,
+                              cuComplex* C,
+                              int ldc);
+void CUBLASWINAPI cublasZgemm(char transa,
+                              char transb,
+                              int m,
+                              int n,
+                              int k,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* B,
+                              int ldb,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* C,
+                              int ldc);
+/* -------------------------------------------------------*/
+/* SYRK: symmetric rank-k update (BLAS level 3). A is const; C is the only writable matrix argument; alpha/beta use the same element type as the matrices. */
+void CUBLASWINAPI
+cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc);
+void CUBLASWINAPI cublasDsyrk(
+    char uplo, char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc);
+
+void CUBLASWINAPI cublasCsyrk(char uplo,
+                              char trans,
+                              int n,
+                              int k,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              cuComplex beta,
+                              cuComplex* C,
+                              int ldc);
+void CUBLASWINAPI cublasZsyrk(char uplo,
+                              char trans,
+                              int n,
+                              int k,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* C,
+                              int ldc);
+/* ------------------------------------------------------- */
+/* HERK: Hermitian rank-k update, declared for the complex types only. Note that alpha and beta are real scalars (float/double) even though A and C are complex. */
+void CUBLASWINAPI cublasCherk(
+    char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc);
+void CUBLASWINAPI cublasZherk(char uplo,
+                              char trans,
+                              int n,
+                              int k,
+                              double alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              double beta,
+                              cuDoubleComplex* C,
+                              int ldc);
+/* ------------------------------------------------------- */
+/* SYR2K: symmetric rank-2k update (BLAS level 3). A and B are const inputs; C is the only writable matrix argument. */
+void CUBLASWINAPI cublasSsyr2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               float alpha,
+                               const float* A,
+                               int lda,
+                               const float* B,
+                               int ldb,
+                               float beta,
+                               float* C,
+                               int ldc);
+
+void CUBLASWINAPI cublasDsyr2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               double alpha,
+                               const double* A,
+                               int lda,
+                               const double* B,
+                               int ldb,
+                               double beta,
+                               double* C,
+                               int ldc);
+void CUBLASWINAPI cublasCsyr2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               cuComplex alpha,
+                               const cuComplex* A,
+                               int lda,
+                               const cuComplex* B,
+                               int ldb,
+                               cuComplex beta,
+                               cuComplex* C,
+                               int ldc);
+
+void CUBLASWINAPI cublasZsyr2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               cuDoubleComplex alpha,
+                               const cuDoubleComplex* A,
+                               int lda,
+                               const cuDoubleComplex* B,
+                               int ldb,
+                               cuDoubleComplex beta,
+                               cuDoubleComplex* C,
+                               int ldc);
+/* ------------------------------------------------------- */
+/* HER2K: Hermitian rank-2k update, declared for the complex types only. alpha is complex, but beta is a real scalar (float/double). */
+void CUBLASWINAPI cublasCher2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               cuComplex alpha,
+                               const cuComplex* A,
+                               int lda,
+                               const cuComplex* B,
+                               int ldb,
+                               float beta,
+                               cuComplex* C,
+                               int ldc);
+
+void CUBLASWINAPI cublasZher2k(char uplo,
+                               char trans,
+                               int n,
+                               int k,
+                               cuDoubleComplex alpha,
+                               const cuDoubleComplex* A,
+                               int lda,
+                               const cuDoubleComplex* B,
+                               int ldb,
+                               double beta,
+                               cuDoubleComplex* C,
+                               int ldc);
+
+/*------------------------------------------------------------------------*/
+/* SYMM: symmetric matrix-matrix multiply (BLAS level 3). side and uplo are single-char selectors (presumably 'L'/'R' and 'U'/'L' as in reference BLAS -- confirm in the cuBLAS docs); C is the only writable matrix. */
+void CUBLASWINAPI cublasSsymm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              const float* B,
+                              int ldb,
+                              float beta,
+                              float* C,
+                              int ldc);
+void CUBLASWINAPI cublasDsymm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              const double* B,
+                              int ldb,
+                              double beta,
+                              double* C,
+                              int ldc);
+
+void CUBLASWINAPI cublasCsymm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* B,
+                              int ldb,
+                              cuComplex beta,
+                              cuComplex* C,
+                              int ldc);
+
+void CUBLASWINAPI cublasZsymm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* B,
+                              int ldb,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* C,
+                              int ldc);
+/*------------------------------------------------------------------------*/
+/* HEMM: Hermitian matrix-matrix multiply, declared for the complex types only; the parameter list is identical to the complex SYMM variants (cublasCsymm/cublasZsymm). */
+void CUBLASWINAPI cublasChemm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              const cuComplex* B,
+                              int ldb,
+                              cuComplex beta,
+                              cuComplex* C,
+                              int ldc);
+void CUBLASWINAPI cublasZhemm(char side,
+                              char uplo,
+                              int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              const cuDoubleComplex* B,
+                              int ldb,
+                              cuDoubleComplex beta,
+                              cuDoubleComplex* C,
+                              int ldc);
+
+/*------------------------------------------------------------------------*/
+/* TRSM: triangular solve with multiple right-hand sides. There is no beta and no separate C: B is the only non-const matrix argument, so the result is presumably written back into B -- confirm in the cuBLAS docs. */
+void CUBLASWINAPI cublasStrsm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              float* B,
+                              int ldb);
+
+void CUBLASWINAPI cublasDtrsm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              double* B,
+                              int ldb);
+
+void CUBLASWINAPI cublasCtrsm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              cuComplex* B,
+                              int ldb);
+
+void CUBLASWINAPI cublasZtrsm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              cuDoubleComplex* B,
+                              int ldb);
+/*------------------------------------------------------------------------*/
+/* TRMM: triangular matrix-matrix multiply. Same parameter list as TRSM; B is the only non-const matrix argument (presumably updated in place -- confirm in the cuBLAS docs). */
+void CUBLASWINAPI cublasStrmm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              float alpha,
+                              const float* A,
+                              int lda,
+                              float* B,
+                              int ldb);
+void CUBLASWINAPI cublasDtrmm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              double alpha,
+                              const double* A,
+                              int lda,
+                              double* B,
+                              int ldb);
+void CUBLASWINAPI cublasCtrmm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              cuComplex alpha,
+                              const cuComplex* A,
+                              int lda,
+                              cuComplex* B,
+                              int ldb);
+void CUBLASWINAPI cublasZtrmm(char side,
+                              char uplo,
+                              char transa,
+                              char diag,
+                              int m,
+                              int n,
+                              cuDoubleComplex alpha,
+                              const cuDoubleComplex* A,
+                              int lda,
+                              cuDoubleComplex* B,
+                              int ldb);
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* !defined(CUBLAS_H_) */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h
new file mode 100644
index 0000000000000000000000000000000000000000..a7c9f346cadcad731d90e2f2c75f1549ff68240e
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h
@@ -0,0 +1,1845 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#pragma once
+/* Fallback CUBLASAPI: expands to __host__ __device__ when __CUDACC__ is defined and to nothing otherwise; a definition made before this header is left untouched. */
+#ifndef CUBLASAPI
+#ifdef __CUDACC__
+#define CUBLASAPI __host__ __device__
+#else
+#define CUBLASAPI
+#endif
+#endif
+
+#include <cublas_api.h>
+
+#include <stdint.h>
+#include <stddef.h>
+#include <stdio.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+/** Opaque structure holding CUBLASLT context
+ * (struct cublasLtContext is not defined in the visible part of this header; callers only hold the pointer type) */
+typedef struct cublasLtContext* cublasLtHandle_t;
+/* Handle lifetime: cublasLtCreate() takes a pointer to a handle, cublasLtDestroy() takes the handle by value. */
+cublasStatus_t CUBLASWINAPI cublasLtCreate(cublasLtHandle_t* lightHandle);
+
+cublasStatus_t CUBLASWINAPI cublasLtDestroy(cublasLtHandle_t lightHandle);
+/* Status-to-text helpers; both take a cublasStatus_t and return a const char*. */
+const char* CUBLASWINAPI cublasLtGetStatusName(cublasStatus_t status);
+
+const char* CUBLASWINAPI cublasLtGetStatusString(cublasStatus_t status);
+/* Version queries take no arguments and return a plain size_t instead of a cublasStatus_t. */
+size_t CUBLASWINAPI cublasLtGetVersion(void);
+
+size_t CUBLASWINAPI cublasLtGetCudartVersion(void);
+
+cublasStatus_t CUBLASWINAPI cublasLtGetProperty(libraryPropertyType type, int* value);
+/* Heuristics cache capacity: the getter takes a size_t* out-parameter, the setter takes the new capacity by value. */
+cublasStatus_t CUBLASWINAPI cublasLtHeuristicsCacheGetCapacity(size_t* capacity);
+cublasStatus_t CUBLASWINAPI cublasLtHeuristicsCacheSetCapacity(size_t capacity);
+
+/** Restricts usage of CPU instructions (ISA) specified by the flags in the mask.
+ *
+ * Flags can be combined with the bitwise OR (|) operator. Supported flags:
+ * - 0x1 -- x86-64 AVX512 ISA
+ *
+ * Default mask: 0 (any applicable ISA is allowed).
+ *
+ * \return The previous value of the mask.
+ * This call takes precedence over the environment variable CUBLASLT_DISABLE_CPU_INSTRUCTIONS_MASK.
+ */
+unsigned CUBLASWINAPI cublasLtDisableCpuInstructionsSetMask(unsigned mask);
+
+/** Semi-opaque descriptor for matrix memory layout
+ * (raw storage only: 8 x uint64_t = 64 bytes, no named fields) */
+typedef struct {
+  uint64_t data[8];
+} cublasLtMatrixLayoutOpaque_t;
+
+/** Opaque descriptor for matrix memory layout (pointer to cublasLtMatrixLayoutOpaque_t)
+ */
+typedef cublasLtMatrixLayoutOpaque_t* cublasLtMatrixLayout_t;
+
+/** Semi-opaque algorithm descriptor (to avoid complicated alloc/free schemes)
+ * Storage is 8 x uint64_t (64 bytes); unlike the other descriptors in this group it has no pointer typedef.
+ * This structure can be trivially serialized and later restored for use with the same version of cuBLAS library to save
+ * on selecting the right configuration again.
+ */
+typedef struct {
+  uint64_t data[8];
+} cublasLtMatmulAlgo_t;
+
+/** Semi-opaque descriptor for cublasLtMatmul() operation details
+ * (raw storage only: 32 x uint64_t = 256 bytes, the largest descriptor in this group) */
+typedef struct {
+  uint64_t data[32];
+} cublasLtMatmulDescOpaque_t;
+
+/** Opaque descriptor for cublasLtMatmul() operation details (pointer to cublasLtMatmulDescOpaque_t)
+ */
+typedef cublasLtMatmulDescOpaque_t* cublasLtMatmulDesc_t;
+
+/** Semi-opaque descriptor for cublasLtMatrixTransform() operation details
+ * (raw storage only: 8 x uint64_t = 64 bytes) */
+typedef struct {
+  uint64_t data[8];
+} cublasLtMatrixTransformDescOpaque_t;
+
+/** Opaque descriptor for cublasLtMatrixTransform() operation details (pointer to the struct above)
+ */
+typedef cublasLtMatrixTransformDescOpaque_t* cublasLtMatrixTransformDesc_t;
+
+/** Semi-opaque descriptor for cublasLtMatmulPreference() operation details
+ * (raw storage only: 8 x uint64_t = 64 bytes) */
+typedef struct {
+  uint64_t data[8];
+} cublasLtMatmulPreferenceOpaque_t;
+
+/** Opaque descriptor for cublasLtMatmulAlgoGetHeuristic() configuration (pointer to the struct above)
+ */
+typedef cublasLtMatmulPreferenceOpaque_t* cublasLtMatmulPreference_t;
+
+/** Tile size (in C/D matrix Rows x Cols)
+ *
+ * General order of tile IDs is sorted by size first and by first dimension second; IDs 26 and up (64x96 onwards) do
+ * not follow that order. CUBLASLT_MATMUL_TILE_END has no initializer, so it is 36 (last ID + 1). */
+typedef enum {
+  CUBLASLT_MATMUL_TILE_UNDEFINED = 0,
+  CUBLASLT_MATMUL_TILE_8x8 = 1,
+  CUBLASLT_MATMUL_TILE_8x16 = 2,
+  CUBLASLT_MATMUL_TILE_16x8 = 3,
+  CUBLASLT_MATMUL_TILE_8x32 = 4,
+  CUBLASLT_MATMUL_TILE_16x16 = 5,
+  CUBLASLT_MATMUL_TILE_32x8 = 6,
+  CUBLASLT_MATMUL_TILE_8x64 = 7,
+  CUBLASLT_MATMUL_TILE_16x32 = 8,
+  CUBLASLT_MATMUL_TILE_32x16 = 9,
+  CUBLASLT_MATMUL_TILE_64x8 = 10,
+  CUBLASLT_MATMUL_TILE_32x32 = 11,
+  CUBLASLT_MATMUL_TILE_32x64 = 12,
+  CUBLASLT_MATMUL_TILE_64x32 = 13,
+  CUBLASLT_MATMUL_TILE_32x128 = 14,
+  CUBLASLT_MATMUL_TILE_64x64 = 15,
+  CUBLASLT_MATMUL_TILE_128x32 = 16,
+  CUBLASLT_MATMUL_TILE_64x128 = 17,
+  CUBLASLT_MATMUL_TILE_128x64 = 18,
+  CUBLASLT_MATMUL_TILE_64x256 = 19,
+  CUBLASLT_MATMUL_TILE_128x128 = 20,
+  CUBLASLT_MATMUL_TILE_256x64 = 21,
+  CUBLASLT_MATMUL_TILE_64x512 = 22,
+  CUBLASLT_MATMUL_TILE_128x256 = 23,
+  CUBLASLT_MATMUL_TILE_256x128 = 24,
+  CUBLASLT_MATMUL_TILE_512x64 = 25,
+  CUBLASLT_MATMUL_TILE_64x96 = 26,
+  CUBLASLT_MATMUL_TILE_96x64 = 27,
+  CUBLASLT_MATMUL_TILE_96x128 = 28,
+  CUBLASLT_MATMUL_TILE_128x160 = 29,
+  CUBLASLT_MATMUL_TILE_160x128 = 30,
+  CUBLASLT_MATMUL_TILE_192x128 = 31,
+  CUBLASLT_MATMUL_TILE_128x192 = 32,
+  CUBLASLT_MATMUL_TILE_128x96 = 33,
+  CUBLASLT_MATMUL_TILE_32x256 = 34,
+  CUBLASLT_MATMUL_TILE_256x32 = 35,
+  CUBLASLT_MATMUL_TILE_END
+} cublasLtMatmulTile_t;
+
+/** Size and number of stages in which elements are read into shared memory
+ *
+ * General order of stages IDs is sorted by stage size first and by number of stages second; IDs 25 and up do not
+ * follow that order, and 29/30 are unassigned. CUBLASLT_MATMUL_STAGES_END has no initializer, so it is 37. */
+typedef enum {
+  CUBLASLT_MATMUL_STAGES_UNDEFINED = 0,
+  CUBLASLT_MATMUL_STAGES_16x1 = 1,
+  CUBLASLT_MATMUL_STAGES_16x2 = 2,
+  CUBLASLT_MATMUL_STAGES_16x3 = 3,
+  CUBLASLT_MATMUL_STAGES_16x4 = 4,
+  CUBLASLT_MATMUL_STAGES_16x5 = 5,
+  CUBLASLT_MATMUL_STAGES_16x6 = 6,
+  CUBLASLT_MATMUL_STAGES_32x1 = 7,
+  CUBLASLT_MATMUL_STAGES_32x2 = 8,
+  CUBLASLT_MATMUL_STAGES_32x3 = 9,
+  CUBLASLT_MATMUL_STAGES_32x4 = 10,
+  CUBLASLT_MATMUL_STAGES_32x5 = 11,
+  CUBLASLT_MATMUL_STAGES_32x6 = 12,
+  CUBLASLT_MATMUL_STAGES_64x1 = 13,
+  CUBLASLT_MATMUL_STAGES_64x2 = 14,
+  CUBLASLT_MATMUL_STAGES_64x3 = 15,
+  CUBLASLT_MATMUL_STAGES_64x4 = 16,
+  CUBLASLT_MATMUL_STAGES_64x5 = 17,
+  CUBLASLT_MATMUL_STAGES_64x6 = 18,
+  CUBLASLT_MATMUL_STAGES_128x1 = 19,
+  CUBLASLT_MATMUL_STAGES_128x2 = 20,
+  CUBLASLT_MATMUL_STAGES_128x3 = 21,
+  CUBLASLT_MATMUL_STAGES_128x4 = 22,
+  CUBLASLT_MATMUL_STAGES_128x5 = 23,
+  CUBLASLT_MATMUL_STAGES_128x6 = 24,
+  CUBLASLT_MATMUL_STAGES_32x10 = 25,
+  CUBLASLT_MATMUL_STAGES_8x4 = 26,
+  CUBLASLT_MATMUL_STAGES_16x10 = 27,
+  CUBLASLT_MATMUL_STAGES_8x5 = 28,
+  CUBLASLT_MATMUL_STAGES_8x3 = 31,
+  CUBLASLT_MATMUL_STAGES_8xAUTO = 32,
+  CUBLASLT_MATMUL_STAGES_16xAUTO = 33,
+  CUBLASLT_MATMUL_STAGES_32xAUTO = 34,
+  CUBLASLT_MATMUL_STAGES_64xAUTO = 35,
+  CUBLASLT_MATMUL_STAGES_128xAUTO = 36,
+  CUBLASLT_MATMUL_STAGES_END
+} cublasLtMatmulStages_t;
+
+/** Thread Block Cluster size
+ *
+ * Typically dimensioned similar to cublasLtMatmulTile_t, with the third coordinate unused at this time (every shape
+ * below ends in x1). Value 1 is unassigned; CUBLASLT_CLUSTER_SHAPE_END has no initializer, so it is 52. */
+typedef enum {
+  /** Let library pick cluster shape automatically */
+  CUBLASLT_CLUSTER_SHAPE_AUTO = 0,
+  CUBLASLT_CLUSTER_SHAPE_1x1x1 = 2,
+  CUBLASLT_CLUSTER_SHAPE_2x1x1 = 3,
+  CUBLASLT_CLUSTER_SHAPE_4x1x1 = 4,
+  CUBLASLT_CLUSTER_SHAPE_1x2x1 = 5,
+  CUBLASLT_CLUSTER_SHAPE_2x2x1 = 6,
+  CUBLASLT_CLUSTER_SHAPE_4x2x1 = 7,
+  CUBLASLT_CLUSTER_SHAPE_1x4x1 = 8,
+  CUBLASLT_CLUSTER_SHAPE_2x4x1 = 9,
+  CUBLASLT_CLUSTER_SHAPE_4x4x1 = 10,
+  CUBLASLT_CLUSTER_SHAPE_8x1x1 = 11,
+  CUBLASLT_CLUSTER_SHAPE_1x8x1 = 12,
+  CUBLASLT_CLUSTER_SHAPE_8x2x1 = 13,
+  CUBLASLT_CLUSTER_SHAPE_2x8x1 = 14,
+  CUBLASLT_CLUSTER_SHAPE_16x1x1 = 15,
+  CUBLASLT_CLUSTER_SHAPE_1x16x1 = 16,
+  CUBLASLT_CLUSTER_SHAPE_3x1x1 = 17,
+  CUBLASLT_CLUSTER_SHAPE_5x1x1 = 18,
+  CUBLASLT_CLUSTER_SHAPE_6x1x1 = 19,
+  CUBLASLT_CLUSTER_SHAPE_7x1x1 = 20,
+  CUBLASLT_CLUSTER_SHAPE_9x1x1 = 21,
+  CUBLASLT_CLUSTER_SHAPE_10x1x1 = 22,
+  CUBLASLT_CLUSTER_SHAPE_11x1x1 = 23,
+  CUBLASLT_CLUSTER_SHAPE_12x1x1 = 24,
+  CUBLASLT_CLUSTER_SHAPE_13x1x1 = 25,
+  CUBLASLT_CLUSTER_SHAPE_14x1x1 = 26,
+  CUBLASLT_CLUSTER_SHAPE_15x1x1 = 27,
+  CUBLASLT_CLUSTER_SHAPE_3x2x1 = 28,
+  CUBLASLT_CLUSTER_SHAPE_5x2x1 = 29,
+  CUBLASLT_CLUSTER_SHAPE_6x2x1 = 30,
+  CUBLASLT_CLUSTER_SHAPE_7x2x1 = 31,
+  CUBLASLT_CLUSTER_SHAPE_1x3x1 = 32,
+  CUBLASLT_CLUSTER_SHAPE_2x3x1 = 33,
+  CUBLASLT_CLUSTER_SHAPE_3x3x1 = 34,
+  CUBLASLT_CLUSTER_SHAPE_4x3x1 = 35,
+  CUBLASLT_CLUSTER_SHAPE_5x3x1 = 36,
+  CUBLASLT_CLUSTER_SHAPE_3x4x1 = 37,
+  CUBLASLT_CLUSTER_SHAPE_1x5x1 = 38,
+  CUBLASLT_CLUSTER_SHAPE_2x5x1 = 39,
+  CUBLASLT_CLUSTER_SHAPE_3x5x1 = 40,
+  CUBLASLT_CLUSTER_SHAPE_1x6x1 = 41,
+  CUBLASLT_CLUSTER_SHAPE_2x6x1 = 42,
+  CUBLASLT_CLUSTER_SHAPE_1x7x1 = 43,
+  CUBLASLT_CLUSTER_SHAPE_2x7x1 = 44,
+  CUBLASLT_CLUSTER_SHAPE_1x9x1 = 45,
+  CUBLASLT_CLUSTER_SHAPE_1x10x1 = 46,
+  CUBLASLT_CLUSTER_SHAPE_1x11x1 = 47,
+  CUBLASLT_CLUSTER_SHAPE_1x12x1 = 48,
+  CUBLASLT_CLUSTER_SHAPE_1x13x1 = 49,
+  CUBLASLT_CLUSTER_SHAPE_1x14x1 = 50,
+  CUBLASLT_CLUSTER_SHAPE_1x15x1 = 51,
+  CUBLASLT_CLUSTER_SHAPE_END
+} cublasLtClusterShape_t;
+
+/** Inner size of the kernel
+ *
+ * Represents various aspects of internal kernel design that don't impact CUDA grid size but may have other, more
+ * subtle effects.
+ * Values are consecutive from 0; CUBLASLT_MATMUL_INNER_SHAPE_END has no initializer, so it is 5.
+ */
+typedef enum {
+  CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED = 0,
+  CUBLASLT_MATMUL_INNER_SHAPE_MMA884 = 1,
+  CUBLASLT_MATMUL_INNER_SHAPE_MMA1684 = 2,
+  CUBLASLT_MATMUL_INNER_SHAPE_MMA1688 = 3,
+  CUBLASLT_MATMUL_INNER_SHAPE_MMA16816 = 4,
+  CUBLASLT_MATMUL_INNER_SHAPE_END
+} cublasLtMatmulInnerShape_t;
+
+/** Pointer mode to use for alpha/beta; the first two values alias the cuBLAS pointer modes, 2..4 are cuBLASLt-only */
+typedef enum {
+  /** matches CUBLAS_POINTER_MODE_HOST, pointer targets a single value in host memory */
+  CUBLASLT_POINTER_MODE_HOST = CUBLAS_POINTER_MODE_HOST,
+  /** matches CUBLAS_POINTER_MODE_DEVICE, pointer targets a single value in device memory */
+  CUBLASLT_POINTER_MODE_DEVICE = CUBLAS_POINTER_MODE_DEVICE,
+  /** pointer targets an array in device memory */
+  CUBLASLT_POINTER_MODE_DEVICE_VECTOR = 2,
+  /** alpha pointer targets an array in device memory, beta is zero. Note:
+     CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is not supported, must be 0. */
+  CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO = 3,
+  /** alpha pointer targets an array in device memory, beta is a single value in host memory. */
+  CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST = 4,
+} cublasLtPointerMode_t;
+
+/** Mask to define pointer mode capability; one distinct bit per mode (1, 2, 4, 8, 16), which for the explicitly numbered modes 2..4 equals 1 << mode */
+typedef enum {
+  /** see CUBLASLT_POINTER_MODE_HOST */
+  CUBLASLT_POINTER_MODE_MASK_HOST = 1,
+  /** see CUBLASLT_POINTER_MODE_DEVICE */
+  CUBLASLT_POINTER_MODE_MASK_DEVICE = 2,
+  /** see CUBLASLT_POINTER_MODE_DEVICE_VECTOR */
+  CUBLASLT_POINTER_MODE_MASK_DEVICE_VECTOR = 4,
+  /** see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO */
+  CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_ZERO = 8,
+  /** see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST */
+  CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_HOST = 16,
+} cublasLtPointerModeMask_t;
+
+/** Implementation details that may affect numerical behavior of algorithms. Bit groups: 0-7 operation type, 8-15 accumulator type, 16-23 input type, bit 32 Gaussian flag. */
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_FMA (0x01ull << 0)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_HMMA (0x02ull << 0)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_IMMA (0x04ull << 0)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_DMMA (0x08ull << 0)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_TENSOR_OP_MASK (0xfeull << 0)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_TYPE_MASK (0xffull << 0)
+/* TENSOR_OP_MASK is OP_TYPE_MASK with the FMA bit (0x01) cleared: 0xfe vs 0xff. */
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_16F (0x01ull << 8)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32F (0x02ull << 8)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_64F (0x04ull << 8)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32I (0x08ull << 8)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_TYPE_MASK (0xffull << 8)
+
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16F (0x01ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16BF (0x02ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_TF32 (0x04ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_32F (0x08ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_64F (0x10ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8I (0x20ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E4M3 (0x40ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E5M2 (0x80ull << 16)
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_INPUT_TYPE_MASK (0xffull << 16)
+/* Bit 32 does not fit in 32 bits, hence the ull constants and the uint64_t flag type below. */
+#define CUBLASLT_NUMERICAL_IMPL_FLAGS_GAUSSIAN (0x01ull << 32)
+typedef uint64_t cublasLtNumericalImplFlags_t;
+
+/** Execute matrix multiplication (D = alpha * op(A) * op(B) + beta * C).
+ * A, B and C are const inputs and D is the only writable matrix; alpha and beta are untyped (const void*) pointers.
+ * \retval     CUBLAS_STATUS_NOT_INITIALIZED   if cuBLASLt handle has not been initialized
+ * \retval     CUBLAS_STATUS_INVALID_VALUE     if parameters are in conflict or in an impossible configuration; e.g.
+ *                                             when workspaceSizeInBytes is less than workspace required by configured
+ *                                             algo
+ * \retval     CUBLAS_STATUS_NOT_SUPPORTED     if current implementation on selected device doesn't support configured
+ *                                             operation
+ * \retval     CUBLAS_STATUS_ARCH_MISMATCH     if configured operation cannot be run using selected device
+ * \retval     CUBLAS_STATUS_EXECUTION_FAILED  if CUDA reported an execution error from the device
+ * \retval     CUBLAS_STATUS_SUCCESS           if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmul(cublasLtHandle_t lightHandle,
+                                           cublasLtMatmulDesc_t computeDesc,
+                                           const void* alpha, /* host or device pointer */
+                                           const void* A,
+                                           cublasLtMatrixLayout_t Adesc,
+                                           const void* B,
+                                           cublasLtMatrixLayout_t Bdesc,
+                                           const void* beta, /* host or device pointer */
+                                           const void* C,
+                                           cublasLtMatrixLayout_t Cdesc,
+                                           void* D,
+                                           cublasLtMatrixLayout_t Ddesc,
+                                           const cublasLtMatmulAlgo_t* algo,
+                                           void* workspace,
+                                           size_t workspaceSizeInBytes,
+                                           cudaStream_t stream);
+
+/** Matrix layout conversion helper (C = alpha * op(A) + beta * op(B))
+ *
+ * Can be used to change memory order of data or to scale and shift the values.
+ * A and B are const inputs and C is the only writable matrix; each has its own layout descriptor.
+ * \retval     CUBLAS_STATUS_NOT_INITIALIZED   if cuBLASLt handle has not been initialized
+ * \retval     CUBLAS_STATUS_INVALID_VALUE     if parameters are in conflict or in an impossible configuration; e.g.
+ *                                             when A is not NULL, but Adesc is NULL
+ * \retval     CUBLAS_STATUS_NOT_SUPPORTED     if current implementation on selected device doesn't support configured
+ *                                             operation
+ * \retval     CUBLAS_STATUS_ARCH_MISMATCH     if configured operation cannot be run using selected device
+ * \retval     CUBLAS_STATUS_EXECUTION_FAILED  if CUDA reported an execution error from the device
+ * \retval     CUBLAS_STATUS_SUCCESS           if the operation completed successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransform(cublasLtHandle_t lightHandle,
+                                                    cublasLtMatrixTransformDesc_t transformDesc,
+                                                    const void* alpha, /* host or device pointer */
+                                                    const void* A,
+                                                    cublasLtMatrixLayout_t Adesc,
+                                                    const void* beta, /* host or device pointer */
+                                                    const void* B,
+                                                    cublasLtMatrixLayout_t Bdesc,
+                                                    void* C,
+                                                    cublasLtMatrixLayout_t Cdesc,
+                                                    cudaStream_t stream);
+
+/* ---------------------------------------------------------------------------------------*/
+/* Helper functions for cublasLtMatrixLayout_t */
+/* ---------------------------------------------------------------------------------------*/
+
+/** Enum for data ordering */
+typedef enum {
+  /** Column-major
+   *
+   * Leading dimension is the stride (in elements) to the beginning of next column in memory.
+   */
+  CUBLASLT_ORDER_COL = 0,
+  /** Row-major
+   *
+   * Leading dimension is the stride (in elements) to the beginning of next row in memory.
+   */
+  CUBLASLT_ORDER_ROW = 1,
+  /** Column-major ordered tiles of 32 columns.
+   *
+   * Leading dimension is the stride (in elements) to the beginning of next group of 32-columns. E.g. if matrix has 33
+   * columns and 2 rows, ld must be at least (32) * 2 = 64.
+   */
+  CUBLASLT_ORDER_COL32 = 2,
+  /** Column-major ordered tiles of composite tiles with total 32 columns and 8 rows, tile composed of interleaved
+   * inner tiles of 4 columns within 4 even or odd rows in an alternating pattern.
+   *
+   * Leading dimension is the stride (in elements) to the beginning of the first 32 column x 8 row tile for the next
+   * 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32 * 8) * 1 = 256.
+   */
+  CUBLASLT_ORDER_COL4_4R2_8C = 3,
+  /** Column-major ordered tiles of composite tiles with total 32 columns and 32 rows.
+   * Element offset within the tile is calculated as (((row%8)/2*4+row/8)*2+row%2)*32+col.
+   *
+   * Leading dimension is the stride (in elements) to the beginning of the first 32 column x 32 row tile for the next
+   * 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32*32)*1 = 1024.
+   */
+  CUBLASLT_ORDER_COL32_2R_4R4 = 4,
+
+} cublasLtOrder_t;
+
+/** Attributes of memory layout */
+typedef enum {
+  /** Data type, see cudaDataType.
+   *
+   * uint32_t
+   */
+  CUBLASLT_MATRIX_LAYOUT_TYPE = 0,
+
+  /** Memory order of the data, see cublasLtOrder_t.
+   *
+   * int32_t, default: CUBLASLT_ORDER_COL
+   */
+  CUBLASLT_MATRIX_LAYOUT_ORDER = 1,
+
+  /** Number of rows.
+   *
+   * Usually only values that can be expressed as int32_t are supported.
+   *
+   * uint64_t
+   */
+  CUBLASLT_MATRIX_LAYOUT_ROWS = 2,
+
+  /** Number of columns.
+   *
+   * Usually only values that can be expressed as int32_t are supported.
+   *
+   * uint64_t
+   */
+  CUBLASLT_MATRIX_LAYOUT_COLS = 3,
+
+  /** Matrix leading dimension.
+   *
+   * For CUBLASLT_ORDER_COL this is stride (in elements) of matrix column, for more details and documentation for
+   * other memory orders see documentation for cublasLtOrder_t values.
+   *
+   * Currently only non-negative values are supported, must be large enough so that matrix memory locations are not
+   * overlapping (e.g. greater than or equal to CUBLASLT_MATRIX_LAYOUT_ROWS in case of CUBLASLT_ORDER_COL).
+   *
+   * int64_t
+   */
+  CUBLASLT_MATRIX_LAYOUT_LD = 4,
+
+  /** Number of matmul operations to perform in the batch.
+   *
+   * See also CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT
+   *
+   * int32_t, default: 1
+   */
+  CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT = 5,
+
+  /** Stride (in elements) to the next matrix for strided batch operation.
+   *
+   * When matrix type is planar-complex (CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET != 0), batch stride
+   * is interpreted by cublasLtMatmul() in number of real valued sub-elements. E.g. for data of type CUDA_C_16F,
+   * offset of 1024B is encoded as a stride of value 512 (since each element of the real and imaginary matrices
+   * is a 2B (16bit) floating point type).
+   *
+   * NOTE: A bug in cublasLtMatrixTransform() causes it to interpret the batch stride for a planar-complex matrix
+   * as if it was specified in number of complex elements. Therefore an offset of 1024B must be encoded as stride
+   * value 256 when calling cublasLtMatrixTransform() (each complex element is 4B with real and imaginary values 2B
+   * each). This behavior is expected to be corrected in the next major cuBLAS version.
+   *
+   * int64_t, default: 0
+   */
+  CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET = 6,
+
+  /** Stride (in bytes) to the imaginary plane for planar complex layout.
+   *
+   * int64_t, default: 0 - 0 means that layout is regular (real and imaginary parts of complex numbers are interleaved
+   * in memory in each element)
+   */
+  CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET = 7,
+} cublasLtMatrixLayoutAttribute_t;
+
+/** Internal. Do not use directly; call cublasLtMatrixLayoutInit(), which passes sizeof(*matLayout) as \p size.
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutInit_internal(  //
+    cublasLtMatrixLayout_t matLayout,
+    size_t size,
+    cudaDataType type,
+    uint64_t rows,
+    uint64_t cols,
+    int64_t ld);
+
+/** Initialize matrix layout descriptor in pre-allocated space.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was initialized successfully
+ */
+static inline cublasStatus_t cublasLtMatrixLayoutInit(
+    cublasLtMatrixLayout_t matLayout, cudaDataType type, uint64_t rows, uint64_t cols, int64_t ld) {
+  return cublasLtMatrixLayoutInit_internal(matLayout, sizeof(*matLayout), type, rows, cols, ld);
+}
+
+/** Create new matrix layout descriptor.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was created successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutCreate(  //
+    cublasLtMatrixLayout_t* matLayout,
+    cudaDataType type,
+    uint64_t rows,
+    uint64_t cols,
+    int64_t ld);
+
+/** Destroy matrix layout descriptor.
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS  if operation was successful
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutDestroy(cublasLtMatrixLayout_t matLayout);
+
+/** Set matrix layout descriptor attribute.
+ *
+ * \param[in]  matLayout    The descriptor
+ * \param[in]  attr         The attribute
+ * \param[in]  buf          memory address containing the new value
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutSetAttribute(  //
+    cublasLtMatrixLayout_t matLayout,
+    cublasLtMatrixLayoutAttribute_t attr,
+    const void* buf,
+    size_t sizeInBytes);
+
+/** Get matrix layout descriptor attribute.
+ *
+ * \param[in]  matLayout    The descriptor
+ * \param[in]  attr         The attribute
+ * \param[out] buf          memory address that receives the attribute value
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
+ *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutGetAttribute(  //
+    cublasLtMatrixLayout_t matLayout,
+    cublasLtMatrixLayoutAttribute_t attr,
+    void* buf,
+    size_t sizeInBytes,
+    size_t* sizeWritten);
+
+/* ---------------------------------------------------------------------------------------*/
+/* Helper functions for cublasLtMatmulDesc_t */
+/* ---------------------------------------------------------------------------------------*/
+
+/** Matmul descriptor attributes to define details of the operation. */
+typedef enum {
+  /** Compute type, see cublasComputeType_t. Defines data type used for multiply and accumulate operations and the
+   * accumulator during matrix multiplication.
+   *
+   * int32_t
+   */
+  CUBLASLT_MATMUL_DESC_COMPUTE_TYPE = 0,
+
+  /** Scale type, see cudaDataType. Defines data type of alpha and beta. Accumulator and value from matrix C are
+   * typically converted to scale type before final scaling. Value is then converted from scale type to type of matrix
+   * D before being stored in memory.
+   *
+   * int32_t, default: same as CUBLASLT_MATMUL_DESC_COMPUTE_TYPE
+   */
+  CUBLASLT_MATMUL_DESC_SCALE_TYPE = 1,
+
+  /** Pointer mode of alpha and beta, see cublasLtPointerMode_t. When CUBLASLT_POINTER_MODE_DEVICE_VECTOR is in use,
+   * alpha/beta vector lengths must match number of output matrix rows.
+   *
+   * int32_t, default: CUBLASLT_POINTER_MODE_HOST
+   */
+  CUBLASLT_MATMUL_DESC_POINTER_MODE = 2,
+
+  /** Transform of matrix A, see cublasOperation_t.
+   *
+   * int32_t, default: CUBLAS_OP_N
+   */
+  CUBLASLT_MATMUL_DESC_TRANSA = 3,
+
+  /** Transform of matrix B, see cublasOperation_t.
+   *
+   * int32_t, default: CUBLAS_OP_N
+   */
+  CUBLASLT_MATMUL_DESC_TRANSB = 4,
+
+  /** Transform of matrix C, see cublasOperation_t.
+   *
+   * Currently only CUBLAS_OP_N is supported.
+   *
+   * int32_t, default: CUBLAS_OP_N
+   */
+  CUBLASLT_MATMUL_DESC_TRANSC = 5,
+
+  /** Matrix fill mode, see cublasFillMode_t.
+   *
+   * int32_t, default: CUBLAS_FILL_MODE_FULL
+   */
+  CUBLASLT_MATMUL_DESC_FILL_MODE = 6,
+
+  /** Epilogue function, see cublasLtEpilogue_t.
+   *
+   * uint32_t, default: CUBLASLT_EPILOGUE_DEFAULT
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE = 7,
+
+  /** Bias or bias gradient vector pointer in the device memory.
+   *
+   * Bias case. See CUBLASLT_EPILOGUE_BIAS.
+   * For bias data type see CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE.
+   *
+   * Bias vector length must match matrix D rows count.
+   *
+   * Bias gradient case. See CUBLASLT_EPILOGUE_DRELU_BGRAD and CUBLASLT_EPILOGUE_DGELU_BGRAD.
+   * Bias gradient vector elements are the same type as the output elements
+   * (Ctype) with the exception of IMMA kernels (see CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE).
+   *
+   * Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic()
+   * depend on its value to determine expected pointer alignment.
+   *
+   * Bias case: const void *, default: NULL
+   * Bias gradient case: void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_BIAS_POINTER = 8,
+
+  /** Batch stride for bias or bias gradient vector.
+   *
+   * Used together with CUBLASLT_MATMUL_DESC_BIAS_POINTER when matrix D's CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1.
+   *
+   * int64_t, default: 0
+   */
+  CUBLASLT_MATMUL_DESC_BIAS_BATCH_STRIDE = 10,
+
+  /** Pointer for epilogue auxiliary buffer.
+   *
+   * - Output vector for ReLu bit-mask in forward pass when CUBLASLT_EPILOGUE_RELU_AUX
+   *   or CUBLASLT_EPILOGUE_RELU_AUX_BIAS epilogue is used.
+   * - Input vector for ReLu bit-mask in backward pass when
+   *   CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is used.
+   *
+   * - Output of GELU input matrix in forward pass when
+   *   CUBLASLT_EPILOGUE_GELU_AUX_BIAS epilogue is used.
+   * - Input of GELU input matrix for backward pass when
+   *   CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue is used.
+   *
+   * For aux data type see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE.
+   *
+   * Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic()
+   * depend on its value to determine expected pointer alignment.
+   *
+   * Requires setting CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD attribute.
+   *
+   * Forward pass: void *, default: NULL
+   * Backward pass: const void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER = 11,
+
+  /** Leading dimension for epilogue auxiliary buffer.
+   *
+   * - ReLu bit-mask matrix leading dimension in elements (i.e. bits)
+   *   when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is
+   *   used. Must be divisible by 128 and be no less than the number of rows in the output matrix.
+   *
+   * - GELU input matrix leading dimension in elements
+   *   when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue used.
+   *   Must be divisible by 8 and be no less than the number of rows in the output matrix.
+   *
+   * int64_t, default: 0
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD = 12,
+
+  /** Batch stride for epilogue auxiliary buffer.
+   *
+   * - ReLu bit-mask matrix batch stride in elements (i.e. bits)
+   *   when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is
+   *   used. Must be divisible by 128.
+   *
+   * - GELU input matrix batch stride in elements
+   *   when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue used.
+   *   Must be divisible by 8.
+   *
+   * int64_t, default: 0
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_BATCH_STRIDE = 13,
+
+  /** Batch stride for alpha vector.
+   *
+   * Used together with CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST when matrix D's
+   * CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1. If CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO is set then
+   * CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE must be set to 0 as this mode doesn't support batched alpha vector.
+   *
+   * int64_t, default: 0
+   */
+  CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE = 14,
+
+  /** Number of SMs to target for parallel execution. Optimizes heuristics for execution on a different number of SMs
+   *  when user expects a concurrent stream to be using some of the device resources.
+   *
+   *  int32_t, default: 0 - use the number reported by the device.
+   */
+  CUBLASLT_MATMUL_DESC_SM_COUNT_TARGET = 15,
+
+  /** Device pointer to the scale factor value that converts data in matrix A to the compute data type range.
+   *
+   *  The scaling factor value must have the same type as the compute type.
+   *
+   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
+   *
+   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
+   *  will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  const void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_A_SCALE_POINTER = 17,
+
+  /** Device pointer to the scale factor value to convert data in matrix B to compute data type range.
+   *
+   *  The scaling factor value must have the same type as the compute type.
+   *
+   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
+   *
+   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
+   *  will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  const void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_B_SCALE_POINTER = 18,
+
+  /** Device pointer to the scale factor value to convert data in matrix C to compute data type range.
+   *
+   *  The scaling factor value must have the same type as the compute type.
+   *
+   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
+   *
+   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
+   *  will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  const void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_C_SCALE_POINTER = 19,
+
+  /** Device pointer to the scale factor value to convert data in matrix D to compute data type range.
+   *
+   *  The scaling factor value must have the same type as the compute type.
+   *
+   *  If not specified, or set to NULL, the scaling factor is assumed to be 1.
+   *
+   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
+   *  will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  const void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_D_SCALE_POINTER = 20,
+
+  /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the
+   *  output matrix.
+   *
+   *  The computed value has the same type as the compute type.
+   *
+   *  If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix
+   *  data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_AMAX_D_POINTER = 21,
+
+  /** Type of the data to be stored to the memory pointed to by CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   *
+   *  If unset, the data type defaults to the type of elements of the output matrix with some exceptions, see details
+   * below.
+   *
+   *  ReLu uses a bit-mask.
+   *
+   *  GELU input matrix elements type is the same as the type of elements of
+   *  the output matrix with some exceptions, see details below.
+   *
+   *  For fp8 kernels with output type CUDA_R_8F_E4M3 the aux data type can be CUDA_R_8F_E4M3 or CUDA_R_16F with some
+   *  restrictions.  See https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulDescAttributes_t for more details.
+   *
+   *  If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
+   *  will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  int32_t based on cudaDataType, default: -1
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE = 22,
+
+  /** Device pointer to the scaling factor value to convert results from compute type data range to storage
+   *  data range in the auxiliary matrix that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   *
+   *  The scaling factor value must have the same type as the compute type.
+   *
+   *  If not specified, or set to NULL, the scaling factor is assumed to be 1. If set for an unsupported matrix data,
+   *  scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_POINTER = 23,
+
+  /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the
+   *  buffer that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   *
+   *  The computed value has the same type as the compute type.
+   *
+   *  If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix
+   *  data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_STATUS_INVALID_VALUE.
+   *
+   *  void *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER = 24,
+
+  /** Flag for managing fp8 fast accumulation mode.
+   *  When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results
+   *  will not periodically be promoted to a higher precision.
+   *
+   *  int8_t, default: 0 - fast accumulation mode is disabled.
+   */
+  CUBLASLT_MATMUL_DESC_FAST_ACCUM = 25,
+
+  /** Type of bias or bias gradient vector in the device memory.
+   *
+   * Bias case: see CUBLASLT_EPILOGUE_BIAS.
+   *
+   * Bias vector elements are the same type as the elements of output matrix (Dtype) with the following exceptions:
+   * - IMMA kernels with computeType=CUDA_R_32I and Ctype=CUDA_R_8I where the bias vector elements
+   *   are the same type as alpha, beta (CUBLASLT_MATMUL_DESC_SCALE_TYPE=CUDA_R_32F)
+   * - fp8 kernels with an output type of CUDA_R_32F, CUDA_R_8F_E4M3 or CUDA_R_8F_E5M2, See
+   *   https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul for details.
+   *
+   * int32_t based on cudaDataType, default: -1
+   */
+  CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE = 26,
+
+  /** EXPERIMENTAL: Number of atomic synchronization chunks in the row dimension of the output matrix D.
+   *
+   * int32_t, default: 0 (atomic synchronization disabled)
+   */
+  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS = 27,
+
+  /** EXPERIMENTAL: Number of atomic synchronization chunks in the column dimension of the output matrix D.
+   *
+   * int32_t, default: 0 (atomic synchronization disabled)
+   */
+  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS = 28,
+
+  /** EXPERIMENTAL: Pointer to a device array of input atomic counters consumed by a matmul.
+   *
+   * int32_t *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER = 29,
+
+  /** EXPERIMENTAL: Pointer to a device array of output atomic counters produced by a matmul.
+   *
+   * int32_t *, default: NULL
+   */
+  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER = 30,
+} cublasLtMatmulDescAttributes_t;
+
+/** Internal. Do not use directly; call cublasLtMatmulDescInit(), which passes sizeof(*matmulDesc) as \p size.
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulDescInit_internal(  //
+    cublasLtMatmulDesc_t matmulDesc,
+    size_t size,
+    cublasComputeType_t computeType,
+    cudaDataType_t scaleType);
+
+/** Initialize matmul operation descriptor in pre-allocated space.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was initialized successfully
+ */
+static inline cublasStatus_t cublasLtMatmulDescInit(  //
+    cublasLtMatmulDesc_t matmulDesc,
+    cublasComputeType_t computeType,
+    cudaDataType_t scaleType) {
+  return cublasLtMatmulDescInit_internal(matmulDesc, sizeof(*matmulDesc), computeType, scaleType);
+}
+
+/** Create new matmul operation descriptor.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was created successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulDescCreate(cublasLtMatmulDesc_t* matmulDesc,
+                                                     cublasComputeType_t computeType,
+                                                     cudaDataType_t scaleType);
+
+/** Destroy matmul operation descriptor.
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS  if operation was successful
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulDescDestroy(cublasLtMatmulDesc_t matmulDesc);
+
+/** Set matmul operation descriptor attribute.
+ *
+ * \param[in]  matmulDesc   The descriptor
+ * \param[in]  attr         The attribute
+ * \param[in]  buf          memory address containing the new value
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulDescSetAttribute(  //
+    cublasLtMatmulDesc_t matmulDesc,
+    cublasLtMatmulDescAttributes_t attr,
+    const void* buf,
+    size_t sizeInBytes);
+
+/** Get matmul operation descriptor attribute.
+ *
+ * \param[in]  matmulDesc   The descriptor
+ * \param[in]  attr         The attribute
+ * \param[out] buf          memory address that receives the attribute value
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
+ *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulDescGetAttribute(  //
+    cublasLtMatmulDesc_t matmulDesc,
+    cublasLtMatmulDescAttributes_t attr,
+    void* buf,
+    size_t sizeInBytes,
+    size_t* sizeWritten);
+
+/* ---------------------------------------------------------------------------------------*/
+/* Helper functions for cublasLtMatrixTransformDesc_t */
+/* ---------------------------------------------------------------------------------------*/
+
+/** Matrix transform descriptor attributes to define details of the operation.
+ * Enumerators have no explicit initializers, so they take the values 0..3 in declaration order. */
+typedef enum {
+  /** Scale type, see cudaDataType. Inputs are converted to scale type for scaling and summation and results are then
+   * converted to output type to store in memory.
+   *
+   * int32_t
+   */
+  CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,
+
+  /** Pointer mode of alpha and beta, see cublasLtPointerMode_t.
+   *
+   * int32_t, default: CUBLASLT_POINTER_MODE_HOST
+   */
+  CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,
+
+  /** Transform of matrix A, see cublasOperation_t.
+   *
+   * int32_t, default: CUBLAS_OP_N
+   */
+  CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA,
+
+  /** Transform of matrix B, see cublasOperation_t.
+   *
+   * int32_t, default: CUBLAS_OP_N
+   */
+  CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSB,
+} cublasLtMatrixTransformDescAttributes_t;
+
+/** Internal. Do not use directly; call cublasLtMatrixTransformDescInit(), which passes sizeof(*transformDesc) as size.
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescInit_internal(cublasLtMatrixTransformDesc_t transformDesc,
+                                                                     size_t size,
+                                                                     cudaDataType scaleType);
+
+/** Initialize matrix transform operation descriptor in pre-allocated space.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was initialized successfully
+ */
+static inline cublasStatus_t cublasLtMatrixTransformDescInit(cublasLtMatrixTransformDesc_t transformDesc,
+                                                             cudaDataType scaleType) {
+  return cublasLtMatrixTransformDescInit_internal(transformDesc, sizeof(*transformDesc), scaleType);
+}
+
+/** Create new matrix transform operation descriptor.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was created successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescCreate(cublasLtMatrixTransformDesc_t* transformDesc,
+                                                              cudaDataType scaleType);
+
+/** Destroy matrix transform operation descriptor.
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS  if operation was successful
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescDestroy(cublasLtMatrixTransformDesc_t transformDesc);
+
+/** Set matrix transform operation descriptor attribute.
+ *
+ * \param[in]  transformDesc  The descriptor
+ * \param[in]  attr           The attribute
+ * \param[in]  buf            memory address containing the new value
+ * \param[in]  sizeInBytes    size of buf buffer for verification (in bytes)
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescSetAttribute(  //
+    cublasLtMatrixTransformDesc_t transformDesc,
+    cublasLtMatrixTransformDescAttributes_t attr,
+    const void* buf,
+    size_t sizeInBytes);
+
+/** Get matrix transform operation descriptor attribute.
+ *
+ * \param[in]  transformDesc  The descriptor
+ * \param[in]  attr           The attribute
+ * \param[out] buf            memory address that receives the attribute value
+ * \param[in]  sizeInBytes    size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten    only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number
+ *                            of bytes actually written, if sizeInBytes is 0: number of bytes needed for full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescGetAttribute(  //
+    cublasLtMatrixTransformDesc_t transformDesc,
+    cublasLtMatrixTransformDescAttributes_t attr,
+    void* buf,
+    size_t sizeInBytes,
+    size_t* sizeWritten);
+
+/** Reduction scheme for portions of the dot-product calculated in parallel (a. k. a. "split - K").
+ */
+typedef enum {
+  /** No reduction scheme, dot-product shall be performed in one sequence.
+   */
+  CUBLASLT_REDUCTION_SCHEME_NONE = 0,
+
+  /** Reduction is performed "in place" - using the output buffer (and output data type) and counters (in workspace) to
+   * guarantee the sequentiality.
+   */
+  CUBLASLT_REDUCTION_SCHEME_INPLACE = 1,
+
+  /** Intermediate results are stored in compute type in the workspace and reduced in a separate step.
+   */
+  CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE = 2,
+
+  /** Intermediate results are stored in output type in the workspace and reduced in a separate step.
+   */
+  CUBLASLT_REDUCTION_SCHEME_OUTPUT_TYPE = 4,
+  /** Bitwise OR of the non-zero scheme values above (1 | 2 | 4 == 0x7). */
+  CUBLASLT_REDUCTION_SCHEME_MASK = 0x7,
+} cublasLtReductionScheme_t;
+
+/** Postprocessing options for the epilogue
+ */
+typedef enum {
+  /** No special postprocessing, just scale and quantize results if necessary.
+   */
+  CUBLASLT_EPILOGUE_DEFAULT = 1,
+
+  /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)).
+   */
+  CUBLASLT_EPILOGUE_RELU = 2,
+
+  /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)).
+   *
+   * This epilogue mode produces an extra output, a ReLu bit-mask matrix,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_RELU_AUX = (CUBLASLT_EPILOGUE_RELU | 128),
+
+  /** Bias, apply (broadcasted) Bias from bias vector. Bias vector length must match matrix D rows, it must be packed
+   * (stride between vector elements is 1). Bias vector is broadcasted to all columns and added before applying final
+   * postprocessing.
+   */
+  CUBLASLT_EPILOGUE_BIAS = 4,
+
+  /** ReLu and Bias, apply Bias and then ReLu transform
+   */
+  CUBLASLT_EPILOGUE_RELU_BIAS = (CUBLASLT_EPILOGUE_RELU | CUBLASLT_EPILOGUE_BIAS),
+
+  /** ReLu and Bias, apply Bias and then ReLu transform
+   *
+   * This epilogue mode produces an extra output, a ReLu bit-mask matrix,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_RELU_AUX_BIAS = (CUBLASLT_EPILOGUE_RELU_AUX | CUBLASLT_EPILOGUE_BIAS),
+
+  /** ReLu gradient. Apply ReLu gradient to matmul output. Store ReLu gradient in the output matrix.
+   *
+   * This epilogue mode requires an extra input,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_DRELU = 8 | 128,
+
+  /** ReLu and Bias gradients. Apply independently ReLu and Bias gradient to
+   * matmul output. Store ReLu gradient in the output matrix, and Bias gradient
+   * in the auxiliary output (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
+   *
+   * This epilogue mode requires an extra input,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_DRELU_BGRAD = CUBLASLT_EPILOGUE_DRELU | 16,
+
+  /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)).
+   */
+  CUBLASLT_EPILOGUE_GELU = 32,
+
+  /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)).
+   *
+   * This epilogue mode outputs GELU input as a separate matrix (useful for training).
+   * See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_GELU_AUX = (CUBLASLT_EPILOGUE_GELU | 128),
+
+  /** GELU and Bias, apply Bias and then GELU transform
+   */
+  CUBLASLT_EPILOGUE_GELU_BIAS = (CUBLASLT_EPILOGUE_GELU | CUBLASLT_EPILOGUE_BIAS),
+
+  /** GELU and Bias, apply Bias and then GELU transform
+   *
+   * This epilogue mode outputs GELU input as a separate matrix (useful for training).
+   * See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_GELU_AUX_BIAS = (CUBLASLT_EPILOGUE_GELU_AUX | CUBLASLT_EPILOGUE_BIAS),
+
+  /** GELU gradient. Apply GELU gradient to matmul output. Store GELU gradient in the output matrix.
+   *
+   * This epilogue mode requires an extra input,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_DGELU = 64 | 128,
+
+  /** GELU and Bias gradients. Apply independently GELU and Bias gradient to
+   * matmul output. Store GELU gradient in the output matrix, and Bias gradient
+   * in the auxiliary output (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
+   *
+   * This epilogue mode requires an extra input,
+   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
+   */
+  CUBLASLT_EPILOGUE_DGELU_BGRAD = CUBLASLT_EPILOGUE_DGELU | 16,
+
+  /** Bias gradient based on the input matrix A.
+   *
+   * The bias size corresponds to the number of rows of the matrix D.
+   * The reduction happens over the GEMM's "k" dimension.
+   *
+   * Stores Bias gradient in the auxiliary output
+   * (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
+   */
+  CUBLASLT_EPILOGUE_BGRADA = 256,
+
+  /** Bias gradient based on the input matrix B.
+   *
+   * The bias size corresponds to the number of columns of the matrix D.
+   * The reduction happens over the GEMM's "k" dimension.
+   *
+   * Stores Bias gradient in the auxiliary output
+   * (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
+   */
+  CUBLASLT_EPILOGUE_BGRADB = 512,
+} cublasLtEpilogue_t;
+
+/** Matmul heuristic search mode
+ */
+typedef enum {
+  /** ask heuristics for best algo for given usecase
+   */
+  CUBLASLT_SEARCH_BEST_FIT = 0,
+  /** only try to find best config for preconfigured algo id
+   */
+  CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID = 1,
+  /** reserved for future use
+   */
+  CUBLASLT_SEARCH_RESERVED_02 = 2,
+  /** reserved for future use
+   */
+  CUBLASLT_SEARCH_RESERVED_03 = 3,
+  /** reserved for future use
+   */
+  CUBLASLT_SEARCH_RESERVED_04 = 4,
+  /** reserved for future use
+   */
+  CUBLASLT_SEARCH_RESERVED_05 = 5,
+} cublasLtMatmulSearch_t;
+
+/** Algo search preference to fine tune the heuristic function. */
+typedef enum {
+  /** Search mode, see cublasLtMatmulSearch_t.
+   *
+   * uint32_t, default: CUBLASLT_SEARCH_BEST_FIT
+   */
+  CUBLASLT_MATMUL_PREF_SEARCH_MODE = 0,
+
+  /** Maximum allowed workspace size in bytes.
+   *
+   * uint64_t, default: 0 - no workspace allowed
+   */
+  CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES = 1,
+
+  /** Reduction scheme mask, see cublasLtReductionScheme_t. Filters heuristic result to only include algo configs that
+   * use one of the required modes.
+   *
+   * E.g. mask value of 0x03 will allow only INPLACE and COMPUTE_TYPE reduction schemes.
+   *
+   * uint32_t, default: CUBLASLT_REDUCTION_SCHEME_MASK (allows all reduction schemes)
+   */
+  CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK = 3,
+
+  /** Minimum buffer alignment for matrix A (in bytes).
+   *
+   * Selecting a smaller value will exclude algorithms that can not work with matrix A that is not as strictly aligned
+   * as they need.
+   *
+   * uint32_t, default: 256
+   */
+  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES = 5,
+
+  /** Minimum buffer alignment for matrix B (in bytes).
+   *
+   * Selecting a smaller value will exclude algorithms that can not work with matrix B that is not as strictly aligned
+   * as they need.
+   *
+   * uint32_t, default: 256
+   */
+  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES = 6,
+
+  /** Minimum buffer alignment for matrix C (in bytes).
+   *
+   * Selecting a smaller value will exclude algorithms that can not work with matrix C that is not as strictly aligned
+   * as they need.
+   *
+   * uint32_t, default: 256
+   */
+  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES = 7,
+
+  /** Minimum buffer alignment for matrix D (in bytes).
+   *
+   * Selecting a smaller value will exclude algorithms that can not work with matrix D that is not as strictly aligned
+   * as they need.
+   *
+   * uint32_t, default: 256
+   */
+  CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES = 8,
+
+  /** Maximum wave count.
+   *
+   * See cublasLtMatmulHeuristicResult_t::wavesCount.
+   *
+   * Selecting a non-zero value will exclude algorithms that report device utilization higher than specified.
+   *
+   * float, default: 0.0f
+   */
+  CUBLASLT_MATMUL_PREF_MAX_WAVES_COUNT = 9,
+
+  /** Numerical implementation details mask, see cublasLtNumericalImplFlags_t. Filters heuristic result to only include
+   * algorithms that use the allowed implementations.
+   *
+   * uint64_t, default: uint64_t(-1) (allow everything)
+   */
+  CUBLASLT_MATMUL_PREF_IMPL_MASK = 12,
+} cublasLtMatmulPreferenceAttributes_t;
+
+/** Internal. Do not use directly.
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceInit_internal(cublasLtMatmulPreference_t pref, size_t size);
+
+/** Initialize matmul heuristic search preference descriptor in pre-allocated space.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if size of the pre-allocated space is insufficient
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was initialized successfully
+ */
+static inline cublasStatus_t cublasLtMatmulPreferenceInit(cublasLtMatmulPreference_t pref) {
+  return cublasLtMatmulPreferenceInit_internal(pref, sizeof(*pref));
+}
+
+/** Create new matmul heuristic search preference descriptor.
+ *
+ * \retval     CUBLAS_STATUS_ALLOC_FAILED  if memory could not be allocated
+ * \retval     CUBLAS_STATUS_SUCCESS       if descriptor was created successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceCreate(cublasLtMatmulPreference_t* pref);
+
+/** Destroy matmul heuristic search preference descriptor.
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS  if operation was successful
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceDestroy(cublasLtMatmulPreference_t pref);
+
+/** Set matmul heuristic search preference descriptor attribute.
+ *
+ * \param[in]  pref         The descriptor
+ * \param[in]  attr         The attribute
+ * \param[in]  buf          memory address containing the new value
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceSetAttribute(  //
+    cublasLtMatmulPreference_t pref,
+    cublasLtMatmulPreferenceAttributes_t attr,
+    const void* buf,
+    size_t sizeInBytes);
+
+/** Get matmul heuristic search preference descriptor attribute.
+ *
+ * \param[in]  pref         The descriptor
+ * \param[in]  attr         The attribute
+ * \param[out] buf          memory address the attribute value is written to
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
+ *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceGetAttribute(  //
+    cublasLtMatmulPreference_t pref,
+    cublasLtMatmulPreferenceAttributes_t attr,
+    void* buf,
+    size_t sizeInBytes,
+    size_t* sizeWritten);
+
+/** Results structure used by cublasLtMatmulAlgoGetHeuristic() and cublasLtMatmulAlgoCheck().
+ *
+ * Holds returned configured algo descriptor and its runtime properties.
+ */
+typedef struct {
+  /** Matmul algorithm descriptor.
+   *
+   * Must be initialized with cublasLtMatmulAlgoInit() if preferences' CUBLASLT_MATMUL_PREF_SEARCH_MODE is set to
+   * CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID
+   */
+  cublasLtMatmulAlgo_t algo;
+
+  /** Actual size of workspace memory required.
+   */
+  size_t workspaceSize;
+
+  /** Result status, other fields are only valid if after call to cublasLtMatmulAlgoGetHeuristic() this member is set to
+   * CUBLAS_STATUS_SUCCESS.
+   */
+  cublasStatus_t state;
+
+  /** Waves count - a device utilization metric.
+   *
+   * wavesCount value of 1.0f suggests that when kernel is launched it will fully occupy the GPU.
+   */
+  float wavesCount;
+
+  int reserved[4];
+} cublasLtMatmulHeuristicResult_t;
+
+/** Query cublasLt heuristic for algorithm appropriate for given use case.
+ *
+ * \param[in]      lightHandle            Pointer to the allocated cuBLASLt handle for the cuBLASLt
+ *                                        context. See cublasLtHandle_t.
+ * \param[in]      operationDesc          Handle to the matrix multiplication descriptor.
+ * \param[in]      Adesc                  Handle to the layout descriptors for matrix A.
+ * \param[in]      Bdesc                  Handle to the layout descriptors for matrix B.
+ * \param[in]      Cdesc                  Handle to the layout descriptors for matrix C.
+ * \param[in]      Ddesc                  Handle to the layout descriptors for matrix D.
+ * \param[in]      preference             Pointer to the structure holding the heuristic search
+ *                                        preferences descriptor. See cublasLtMatmulPreference_t.
+ * \param[in]      requestedAlgoCount     Size of heuristicResultsArray (in elements) and requested
+ *                                        maximum number of algorithms to return.
+ * \param[in, out] heuristicResultsArray  Output algorithms and associated runtime characteristics,
+ *                                        ordered in increasing estimated compute time.
+ * \param[out]     returnAlgoCount        The number of heuristicResultsArray elements written.
+ *
+ * \retval  CUBLAS_STATUS_INVALID_VALUE   if requestedAlgoCount is less than or equal to zero
+ * \retval  CUBLAS_STATUS_NOT_SUPPORTED   if no heuristic function available for current configuration
+ * \retval  CUBLAS_STATUS_SUCCESS         if query was successful, inspect
+ *                                        heuristicResultsArray[0 to (returnAlgoCount - 1)].state
+ *                                        for detail status of results
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetHeuristic(cublasLtHandle_t lightHandle,
+                                                           cublasLtMatmulDesc_t operationDesc,
+                                                           cublasLtMatrixLayout_t Adesc,
+                                                           cublasLtMatrixLayout_t Bdesc,
+                                                           cublasLtMatrixLayout_t Cdesc,
+                                                           cublasLtMatrixLayout_t Ddesc,
+                                                           cublasLtMatmulPreference_t preference,
+                                                           int requestedAlgoCount,
+                                                           cublasLtMatmulHeuristicResult_t heuristicResultsArray[],
+                                                           int* returnAlgoCount);
+
+/* ---------------------------------------------------------------------------------------*/
+/* Lower level API to be able to implement own Heuristic and Find routines                */
+/* ---------------------------------------------------------------------------------------*/
+
+/** Routine to get all algo IDs that can potentially run
+ *
+ * \param[in]  requestedAlgoCount  requested number of algos (must not exceed size of algoIdsArray in elements)
+ * \param[out] algoIdsArray        array to write algoIds to
+ * \param[out] returnAlgoCount     number of algoIds actually written
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if requestedAlgoCount is less than or equal to zero
+ * \retval     CUBLAS_STATUS_SUCCESS        if query was successful, inspect returnAlgoCount to get actual number of IDs
+ *                                          available
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetIds(cublasLtHandle_t lightHandle,
+                                                     cublasComputeType_t computeType,
+                                                     cudaDataType_t scaleType,
+                                                     cudaDataType_t Atype,
+                                                     cudaDataType_t Btype,
+                                                     cudaDataType_t Ctype,
+                                                     cudaDataType_t Dtype,
+                                                     int requestedAlgoCount,
+                                                     int algoIdsArray[],
+                                                     int* returnAlgoCount);
+
+/** Initialize algo structure
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if algo is NULL or algoId is outside of recognized range
+ * \retval     CUBLAS_STATUS_NOT_SUPPORTED  if algoId is not supported for given combination of data types
+ * \retval     CUBLAS_STATUS_SUCCESS        if the structure was successfully initialized
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoInit(cublasLtHandle_t lightHandle,
+                                                   cublasComputeType_t computeType,
+                                                   cudaDataType_t scaleType,
+                                                   cudaDataType_t Atype,
+                                                   cudaDataType_t Btype,
+                                                   cudaDataType_t Ctype,
+                                                   cudaDataType_t Dtype,
+                                                   int algoId,
+                                                   cublasLtMatmulAlgo_t* algo);
+
+/** Check configured algo descriptor for correctness and support on current device.
+ *
+ * Result includes required workspace size and calculated wave count.
+ *
+ * CUBLAS_STATUS_SUCCESS doesn't fully guarantee algo will run (will fail if e.g. buffers are not correctly aligned);
+ * but if cublasLtMatmulAlgoCheck fails, the algo will not run.
+ *
+ * \param[in]  algo    algo configuration to check
+ * \param[out] result  result structure to report algo runtime characteristics; algo field is never updated
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if matrix layout descriptors or operation descriptor don't match algo
+ *                                          descriptor
+ * \retval     CUBLAS_STATUS_NOT_SUPPORTED  if algo configuration or data type combination is not currently supported on
+ *                                          given device
+ * \retval     CUBLAS_STATUS_ARCH_MISMATCH  if algo configuration cannot be run using the selected device
+ * \retval     CUBLAS_STATUS_SUCCESS        if check was successful
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCheck(  //
+    cublasLtHandle_t lightHandle,
+    cublasLtMatmulDesc_t operationDesc,
+    cublasLtMatrixLayout_t Adesc,
+    cublasLtMatrixLayout_t Bdesc,
+    cublasLtMatrixLayout_t Cdesc,
+    cublasLtMatrixLayout_t Ddesc,
+    const cublasLtMatmulAlgo_t* algo,  ///< may point to result->algo
+    cublasLtMatmulHeuristicResult_t* result);
+
+/** Capabilities Attributes that can be retrieved from an initialized Algo structure
+ */
+typedef enum {
+  /** support for split K, see CUBLASLT_ALGO_CONFIG_SPLITK_NUM
+   *
+   * int32_t, 0 means no support, supported otherwise
+   */
+  CUBLASLT_ALGO_CAP_SPLITK_SUPPORT = 0,
+
+  /** reduction scheme mask, see cublasLtReductionScheme_t; shows supported reduction schemes, if reduction scheme is
+   * not masked out it is supported.
+   *
+   * e.g. int isReductionSchemeComputeTypeSupported = (reductionSchemeMask & CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE) ==
+   * CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE ? 1 : 0;
+   *
+   * uint32_t
+   */
+  CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK = 1,
+
+  /** support for cta swizzling, see CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
+   *
+   * uint32_t, 0 means no support, 1 means supported value of 1, other values are reserved
+   */
+  CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT = 2,
+
+  /** support strided batch
+   *
+   * int32_t, 0 means no support, supported otherwise
+   */
+  CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT = 3,
+
+  /** support results out of place (D != C in D = alpha.A.B + beta.C)
+   *
+   * int32_t, 0 means no support, supported otherwise
+   */
+  CUBLASLT_ALGO_CAP_OUT_OF_PLACE_RESULT_SUPPORT = 4,
+
+  /** syrk/herk support (on top of regular gemm)
+   *
+   * int32_t, 0 means no support, supported otherwise
+   */
+  CUBLASLT_ALGO_CAP_UPLO_SUPPORT = 5,
+
+  /** tile ids possible to use, see cublasLtMatmulTile_t; if no tile ids are supported use
+   * CUBLASLT_MATMUL_TILE_UNDEFINED
+   *
+   * use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count
+   *
+   * array of uint32_t
+   */
+  CUBLASLT_ALGO_CAP_TILE_IDS = 6,
+
+  /** custom option range is from 0 to CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX (inclusive), see
+   * CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION
+   *
+   * int32_t
+   */
+  CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX = 7,
+
+  /** whether algorithm supports custom (not COL or ROW memory order), see cublasLtOrder_t
+   *
+   * int32_t 0 means only COL and ROW memory order is allowed, non-zero means that algo might have different
+   * requirements;
+   */
+  CUBLASLT_ALGO_CAP_CUSTOM_MEMORY_ORDER = 10,
+
+  /** bitmask enumerating pointer modes algorithm supports
+   *
+   * uint32_t, see cublasLtPointerModeMask_t
+   */
+  CUBLASLT_ALGO_CAP_POINTER_MODE_MASK = 11,
+
+  /** bitmask enumerating kinds of postprocessing algorithm supports in the epilogue
+   *
+   * uint32_t, see cublasLtEpilogue_t
+   */
+  CUBLASLT_ALGO_CAP_EPILOGUE_MASK = 12,
+
+  /** stages ids possible to use, see cublasLtMatmulStages_t; if no stages ids are supported use
+   * CUBLASLT_MATMUL_STAGES_UNDEFINED
+   *
+   * use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count
+   *
+   * array of uint32_t
+   */
+  CUBLASLT_ALGO_CAP_STAGES_IDS = 13,
+
+  /** support for negative ld for all of the matrices
+   *
+   * int32_t 0 means no support, supported otherwise
+   */
+  CUBLASLT_ALGO_CAP_LD_NEGATIVE = 14,
+
+  /** details about algorithm's implementation that affect its numerical behavior
+   *
+   * uint64_t, see cublasLtNumericalImplFlags_t
+   */
+  CUBLASLT_ALGO_CAP_NUMERICAL_IMPL_FLAGS = 15,
+
+  /** minimum alignment required for A matrix in bytes
+   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+   *
+   * uint32_t
+   */
+  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_A_BYTES = 16,
+
+  /** minimum alignment required for B matrix in bytes
+   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+   *
+   * uint32_t
+   */
+  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES = 17,
+
+  /** minimum alignment required for C matrix in bytes
+   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+   *
+   * uint32_t
+   */
+  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES = 18,
+
+  /** minimum alignment required for D matrix in bytes
+   *  (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
+   *
+   * uint32_t
+   */
+  CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES = 19,
+
+  /** EXPERIMENTAL: support for synchronization via atomic counters
+   *
+   * int32_t
+   */
+  CUBLASLT_ALGO_CAP_ATOMIC_SYNC = 20,
+} cublasLtMatmulAlgoCapAttributes_t;
+
+/** Get algo capability attribute.
+ *
+ * E.g. to get list of supported Tile IDs:
+ *      cublasLtMatmulTile_t tiles[CUBLASLT_MATMUL_TILE_END];
+ *      size_t num_tiles = 0, size_written = 0;
+ *      cublasStatus_t status = cublasLtMatmulAlgoCapGetAttribute(
+ *          algo, CUBLASLT_ALGO_CAP_TILE_IDS, tiles, sizeof(tiles), &size_written);
+ *      if (status == CUBLAS_STATUS_SUCCESS) { num_tiles = size_written / sizeof(tiles[0]); }
+ *
+ * \param[in]  algo         The algo descriptor
+ * \param[in]  attr         The attribute
+ * \param[out] buf          memory address the attribute value is written to
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
+ *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCapGetAttribute(const cublasLtMatmulAlgo_t* algo,
+                                                              cublasLtMatmulAlgoCapAttributes_t attr,
+                                                              void* buf,
+                                                              size_t sizeInBytes,
+                                                              size_t* sizeWritten);
+
+/** Algo Configuration Attributes that can be set according to the Algo capabilities
+ */
+typedef enum {
+  /** algorithm index, see cublasLtMatmulAlgoGetIds()
+   *
+   * readonly, set by cublasLtMatmulAlgoInit()
+   * int32_t
+   */
+  CUBLASLT_ALGO_CONFIG_ID = 0,
+  /** tile id, see cublasLtMatmulTile_t
+   *
+   * uint32_t, default: CUBLASLT_MATMUL_TILE_UNDEFINED
+   */
+  CUBLASLT_ALGO_CONFIG_TILE_ID = 1,
+  /** Number of K splits. If the number of K splits is greater than one, SPLITK_NUM parts
+   * of matrix multiplication will be computed in parallel. The results will be accumulated
+   * according to CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
+   *
+   * int32_t, default: 1
+   */
+  CUBLASLT_ALGO_CONFIG_SPLITK_NUM = 2,
+  /** reduction scheme, see cublasLtReductionScheme_t
+   *
+   * uint32_t, default: CUBLASLT_REDUCTION_SCHEME_NONE
+   */
+  CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME = 3,
+  /** cta swizzling, change mapping from CUDA grid coordinates to parts of the matrices
+   *
+   * possible values: 0, 1, other values reserved
+   *
+   * uint32_t, default: 0
+   */
+  CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING = 4,
+  /** custom option, each algorithm can support some custom options that don't fit description of the other config
+   * attributes, see CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX to get accepted range for any specific case
+   *
+   * uint32_t, default: 0
+   */
+  CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION = 5,
+  /** stages id, see cublasLtMatmulStages_t
+   *
+   * uint32_t, default: CUBLASLT_MATMUL_STAGES_UNDEFINED
+   */
+  CUBLASLT_ALGO_CONFIG_STAGES_ID = 6,
+  /** inner shape id, see cublasLtMatmulInnerShape_t
+   *
+   * uint16_t, default: 0 (CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED)
+   */
+  CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID = 7,
+  /** Thread Block Cluster shape id, see cublasLtClusterShape_t. Defines cluster size to use.
+   *
+   * uint16_t, default: 0 (CUBLASLT_CLUSTER_SHAPE_AUTO)
+   */
+  CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID = 8,
+} cublasLtMatmulAlgoConfigAttributes_t;
+
+/** Set algo configuration attribute.
+ *
+ * \param[in]  algo         The algo descriptor
+ * \param[in]  attr         The attribute
+ * \param[in]  buf          memory address containing the new value
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigSetAttribute(cublasLtMatmulAlgo_t* algo,
+                                                                 cublasLtMatmulAlgoConfigAttributes_t attr,
+                                                                 const void* buf,
+                                                                 size_t sizeInBytes);
+
+/** Get algo configuration attribute.
+ *
+ * \param[in]  algo         The algo descriptor
+ * \param[in]  attr         The attribute
+ * \param[out] buf          memory address the attribute value is written to
+ * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
+ * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
+ *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero
+ *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
+ *                                          selected attribute
+ * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
+ */
+cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigGetAttribute(const cublasLtMatmulAlgo_t* algo,
+                                                                 cublasLtMatmulAlgoConfigAttributes_t attr,
+                                                                 void* buf,
+                                                                 size_t sizeInBytes,
+                                                                 size_t* sizeWritten);
+
+/** Experimental: Logger callback type.
+ */
+typedef void (*cublasLtLoggerCallback_t)(int logLevel, const char* functionName, const char* message);
+
+/** Experimental: Logger callback setter.
+ *
+ * \param[in]  callback                     a user defined callback function to be called by the logger
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if callback was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerSetCallback(cublasLtLoggerCallback_t callback);
+
+/** Experimental: Log file setter.
+ *
+ * \param[in]  file                         an open file with write permissions
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if log file was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerSetFile(FILE* file);
+
+/** Experimental: Open log file.
+ *
+ * \param[in]  logFile                      log file path. if the log file does not exist, it will be created
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if log file was created successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerOpenFile(const char* logFile);
+
+/** Experimental: Log level setter.
+ *
+ * \param[in]  level                        log level, should be one of the following:
+ *                                          0. Off
+ *                                          1. Errors
+ *                                          2. Performance Trace
+ *                                          3. Performance Hints
+ *                                          4. Heuristics Trace
+ *                                          5. API Trace
+ *
+ * \retval     CUBLAS_STATUS_INVALID_VALUE  if log level is not one of the above levels
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if log level was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerSetLevel(int level);
+
+/** Experimental: Log mask setter.
+ *
+ * \param[in]  mask                         log mask, should be a combination of the following masks:
+ *                                          0.  Off
+ *                                          1.  Errors
+ *                                          2.  Performance Trace
+ *                                          4.  Performance Hints
+ *                                          8.  Heuristics Trace
+ *                                          16. API Trace
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if log mask was set successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerSetMask(int mask);
+
+/** Experimental: Disable logging for the entire session. Takes no arguments.
+ *
+ * \retval     CUBLAS_STATUS_SUCCESS        if logging was disabled successfully
+ */
+cublasStatus_t CUBLASWINAPI cublasLtLoggerForceDisable(void);
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe0e6f99b952514874c45208e751f5330e71570c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h
@@ -0,0 +1,693 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*   cublasXt : Host API, Out of Core and Multi-GPU BLAS Library
+
+*/
+
+#if !defined(CUBLAS_XT_H_)
+#define CUBLAS_XT_H_
+
+#include "driver_types.h"
+#include "cuComplex.h" /* import complex data type */
+
+#include "cublas_v2.h"
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+struct cublasXtContext;
+typedef struct cublasXtContext* cublasXtHandle_t;
+
+cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle);
+cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle);
+cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards);
+cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards);
+/* This routine selects the GPUs that the user wants to use for CUBLAS-XT */
+cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]);
+
+/* This routine changes the dimension of the tiles (blockDim x blockDim) */
+cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim);
+cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim);
+
+typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
+/* This routine allows CUBLAS-XT to pin the Host memory if it finds out that some of the matrices passed
+   are not pinned. Pinning/Unpinning the Host memory is still a costly operation;
+   it is better if the user controls the memory on their own (by pinning/unpinning only when necessary)
+*/
+cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode);
+cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode);
+
+/* This routine is used to provide a CPU BLAS routine, used for sizes that are too small or for hybrid computation */
+typedef enum {
+  CUBLASXT_FLOAT = 0,
+  CUBLASXT_DOUBLE = 1,
+  CUBLASXT_COMPLEX = 2,
+  CUBLASXT_DOUBLECOMPLEX = 3,
+} cublasXtOpType_t;
+
+typedef enum {
+  CUBLASXT_GEMM = 0,
+  CUBLASXT_SYRK = 1,
+  CUBLASXT_HERK = 2,
+  CUBLASXT_SYMM = 3,
+  CUBLASXT_HEMM = 4,
+  CUBLASXT_TRSM = 5,
+  CUBLASXT_SYR2K = 6,
+  CUBLASXT_HER2K = 7,
+
+  CUBLASXT_SPMM = 8,
+  CUBLASXT_SYRKX = 9,
+  CUBLASXT_HERKX = 10,
+  CUBLASXT_TRMM = 11,
+  CUBLASXT_ROUTINE_MAX = 12,
+} cublasXtBlasOp_t;
+
+/* Currently only 32-bit integer BLAS routines are supported */
+cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle,
+                                                  cublasXtBlasOp_t blasOp,
+                                                  cublasXtOpType_t type,
+                                                  void* blasFunctor);
+
+/* Specifies the percentage of work that should be done by the CPU; default is 0 (no work) */
+cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle,
+                                                cublasXtBlasOp_t blasOp,
+                                                cublasXtOpType_t type,
+                                                float ratio);
+
+/* GEMM */
+cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle,
+                                          cublasOperation_t transa,
+                                          cublasOperation_t transb,
+                                          size_t m,
+                                          size_t n,
+                                          size_t k,
+                                          const float* alpha,
+                                          const float* A,
+                                          size_t lda,
+                                          const float* B,
+                                          size_t ldb,
+                                          const float* beta,
+                                          float* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle,
+                                          cublasOperation_t transa,
+                                          cublasOperation_t transb,
+                                          size_t m,
+                                          size_t n,
+                                          size_t k,
+                                          const double* alpha,
+                                          const double* A,
+                                          size_t lda,
+                                          const double* B,
+                                          size_t ldb,
+                                          const double* beta,
+                                          double* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle,
+                                          cublasOperation_t transa,
+                                          cublasOperation_t transb,
+                                          size_t m,
+                                          size_t n,
+                                          size_t k,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const cuComplex* B,
+                                          size_t ldb,
+                                          const cuComplex* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle,
+                                          cublasOperation_t transa,
+                                          cublasOperation_t transb,
+                                          size_t m,
+                                          size_t n,
+                                          size_t k,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const cuDoubleComplex* B,
+                                          size_t ldb,
+                                          const cuDoubleComplex* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+/* ------------------------------------------------------- */
+/* SYRK */
+cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const float* alpha,
+                                          const float* A,
+                                          size_t lda,
+                                          const float* beta,
+                                          float* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const double* alpha,
+                                          const double* A,
+                                          size_t lda,
+                                          const double* beta,
+                                          double* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const cuComplex* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const cuDoubleComplex* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+/* -------------------------------------------------------------------- */
+/* HERK */
+cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const float* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const float* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          size_t n,
+                                          size_t k,
+                                          const double* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const double* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+/* -------------------------------------------------------------------- */
+/* SYR2K */
+cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const float* alpha,
+                                           const float* A,
+                                           size_t lda,
+                                           const float* B,
+                                           size_t ldb,
+                                           const float* beta,
+                                           float* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const double* alpha,
+                                           const double* A,
+                                           size_t lda,
+                                           const double* B,
+                                           size_t ldb,
+                                           const double* beta,
+                                           double* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuComplex* alpha,
+                                           const cuComplex* A,
+                                           size_t lda,
+                                           const cuComplex* B,
+                                           size_t ldb,
+                                           const cuComplex* beta,
+                                           cuComplex* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuDoubleComplex* alpha,
+                                           const cuDoubleComplex* A,
+                                           size_t lda,
+                                           const cuDoubleComplex* B,
+                                           size_t ldb,
+                                           const cuDoubleComplex* beta,
+                                           cuDoubleComplex* C,
+                                           size_t ldc);
+/* -------------------------------------------------------------------- */
+/* HERKX : variant extension of HERK */
+cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuComplex* alpha,
+                                           const cuComplex* A,
+                                           size_t lda,
+                                           const cuComplex* B,
+                                           size_t ldb,
+                                           const float* beta,
+                                           cuComplex* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuDoubleComplex* alpha,
+                                           const cuDoubleComplex* A,
+                                           size_t lda,
+                                           const cuDoubleComplex* B,
+                                           size_t ldb,
+                                           const double* beta,
+                                           cuDoubleComplex* C,
+                                           size_t ldc);
+
+/* -------------------------------------------------------------------- */
+/* TRSM */
+cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const float* alpha,
+                                          const float* A,
+                                          size_t lda,
+                                          float* B,
+                                          size_t ldb);
+
+cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const double* alpha,
+                                          const double* A,
+                                          size_t lda,
+                                          double* B,
+                                          size_t ldb);
+
+cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          cuComplex* B,
+                                          size_t ldb);
+
+cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          cuDoubleComplex* B,
+                                          size_t ldb);
+/* -------------------------------------------------------------------- */
+/* SYMM : Symmetric Matrix Multiply */
+cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const float* alpha,
+                                          const float* A,
+                                          size_t lda,
+                                          const float* B,
+                                          size_t ldb,
+                                          const float* beta,
+                                          float* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const double* alpha,
+                                          const double* A,
+                                          size_t lda,
+                                          const double* B,
+                                          size_t ldb,
+                                          const double* beta,
+                                          double* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const cuComplex* B,
+                                          size_t ldb,
+                                          const cuComplex* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const cuDoubleComplex* B,
+                                          size_t ldb,
+                                          const cuDoubleComplex* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+/* -------------------------------------------------------------------- */
+/* HEMM : Hermitian Matrix Multiply */
+cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const cuComplex* B,
+                                          size_t ldb,
+                                          const cuComplex* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const cuDoubleComplex* B,
+                                          size_t ldb,
+                                          const cuDoubleComplex* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+
+/* -------------------------------------------------------------------- */
+/* SYRKX : variant extension of SYRK  */
+cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const float* alpha,
+                                           const float* A,
+                                           size_t lda,
+                                           const float* B,
+                                           size_t ldb,
+                                           const float* beta,
+                                           float* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const double* alpha,
+                                           const double* A,
+                                           size_t lda,
+                                           const double* B,
+                                           size_t ldb,
+                                           const double* beta,
+                                           double* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuComplex* alpha,
+                                           const cuComplex* A,
+                                           size_t lda,
+                                           const cuComplex* B,
+                                           size_t ldb,
+                                           const cuComplex* beta,
+                                           cuComplex* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuDoubleComplex* alpha,
+                                           const cuDoubleComplex* A,
+                                           size_t lda,
+                                           const cuDoubleComplex* B,
+                                           size_t ldb,
+                                           const cuDoubleComplex* beta,
+                                           cuDoubleComplex* C,
+                                           size_t ldc);
+/* -------------------------------------------------------------------- */
+/* HER2K : Hermitian rank-2k update */
+cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuComplex* alpha,
+                                           const cuComplex* A,
+                                           size_t lda,
+                                           const cuComplex* B,
+                                           size_t ldb,
+                                           const float* beta,
+                                           cuComplex* C,
+                                           size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle,
+                                           cublasFillMode_t uplo,
+                                           cublasOperation_t trans,
+                                           size_t n,
+                                           size_t k,
+                                           const cuDoubleComplex* alpha,
+                                           const cuDoubleComplex* A,
+                                           size_t lda,
+                                           const cuDoubleComplex* B,
+                                           size_t ldb,
+                                           const double* beta,
+                                           cuDoubleComplex* C,
+                                           size_t ldc);
+
+/* -------------------------------------------------------------------- */
+/* SPMM : Symmetric Packed Matrix Multiply */
+cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const float* alpha,
+                                          const float* AP,
+                                          const float* B,
+                                          size_t ldb,
+                                          const float* beta,
+                                          float* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const double* alpha,
+                                          const double* AP,
+                                          const double* B,
+                                          size_t ldb,
+                                          const double* beta,
+                                          double* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuComplex* alpha,
+                                          const cuComplex* AP,
+                                          const cuComplex* B,
+                                          size_t ldb,
+                                          const cuComplex* beta,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          size_t m,
+                                          size_t n,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* AP,
+                                          const cuDoubleComplex* B,
+                                          size_t ldb,
+                                          const cuDoubleComplex* beta,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+
+/* -------------------------------------------------------------------- */
+/* TRMM: Triangular Matrix Multiply */
+cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const float* alpha,
+                                          const float* A,
+                                          size_t lda,
+                                          const float* B,
+                                          size_t ldb,
+                                          float* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const double* alpha,
+                                          const double* A,
+                                          size_t lda,
+                                          const double* B,
+                                          size_t ldb,
+                                          double* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const cuComplex* alpha,
+                                          const cuComplex* A,
+                                          size_t lda,
+                                          const cuComplex* B,
+                                          size_t ldb,
+                                          cuComplex* C,
+                                          size_t ldc);
+
+cublasStatus_t CUBLASWINAPI cublasXtZtrmm(cublasXtHandle_t handle,
+                                          cublasSideMode_t side,
+                                          cublasFillMode_t uplo,
+                                          cublasOperation_t trans,
+                                          cublasDiagType_t diag,
+                                          size_t m,
+                                          size_t n,
+                                          const cuDoubleComplex* alpha,
+                                          const cuDoubleComplex* A,
+                                          size_t lda,
+                                          const cuDoubleComplex* B,
+                                          size_t ldb,
+                                          cuDoubleComplex* C,
+                                          size_t ldc);
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* !defined(CUBLAS_XT_H_) */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h
new file mode 100644
index 0000000000000000000000000000000000000000..b46b3c41386e3740ae99c246d0fdc606111d8593
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h
@@ -0,0 +1,5793 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ * This is the public header file for the CUBLAS library, defining the API
+ *
+ * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
+ * on top of the CUDA runtime.
+ */
+
+#if !defined(CUBLAS_API_H_)
+#define CUBLAS_API_H_
+
+#ifndef CUBLASWINAPI
+#ifdef _WIN32
+#define CUBLASWINAPI __stdcall
+#else
+#define CUBLASWINAPI
+#endif
+#endif
+
+#ifndef CUBLASAPI
+#error "This file should not be included without defining CUBLASAPI"
+#endif
+
+#include <stdint.h>
+
+#include "driver_types.h"
+#include "cuComplex.h" /* import complex data type */
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+
+#include <library_types.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif /* __cplusplus */
+
+#define CUBLAS_VER_MAJOR 12
+#define CUBLAS_VER_MINOR 4
+#define CUBLAS_VER_PATCH 5
+#define CUBLAS_VER_BUILD 8 /* not folded into CUBLAS_VERSION */
+#define CUBLAS_VERSION (CUBLAS_VER_MAJOR * 10000 + CUBLAS_VER_MINOR * 100 + CUBLAS_VER_PATCH) /* 12.4.5 -> 120405 */
+
+/* CUBLAS status codes returned by API functions; note that the numeric values are not contiguous */
+typedef enum {
+  CUBLAS_STATUS_SUCCESS = 0,
+  CUBLAS_STATUS_NOT_INITIALIZED = 1,
+  CUBLAS_STATUS_ALLOC_FAILED = 3,
+  CUBLAS_STATUS_INVALID_VALUE = 7,
+  CUBLAS_STATUS_ARCH_MISMATCH = 8,
+  CUBLAS_STATUS_MAPPING_ERROR = 11,
+  CUBLAS_STATUS_EXECUTION_FAILED = 13,
+  CUBLAS_STATUS_INTERNAL_ERROR = 14,
+  CUBLAS_STATUS_NOT_SUPPORTED = 15,
+  CUBLAS_STATUS_LICENSE_ERROR = 16
+} cublasStatus_t;
+
+typedef enum { CUBLAS_FILL_MODE_LOWER = 0, CUBLAS_FILL_MODE_UPPER = 1, CUBLAS_FILL_MODE_FULL = 2 } cublasFillMode_t;
+
+typedef enum { CUBLAS_DIAG_NON_UNIT = 0, CUBLAS_DIAG_UNIT = 1 } cublasDiagType_t;
+
+typedef enum { CUBLAS_SIDE_LEFT = 0, CUBLAS_SIDE_RIGHT = 1 } cublasSideMode_t;
+
+typedef enum {
+  CUBLAS_OP_N = 0,
+  CUBLAS_OP_T = 1,
+  CUBLAS_OP_C = 2,
+  CUBLAS_OP_HERMITAN = 2, /* synonym of CUBLAS_OP_C (same value) */
+  CUBLAS_OP_CONJG = 3     /* conjugate, placeholder - not supported in the current release */
+} cublasOperation_t;
+
+typedef enum { CUBLAS_POINTER_MODE_HOST = 0, CUBLAS_POINTER_MODE_DEVICE = 1 } cublasPointerMode_t;
+
+typedef enum { CUBLAS_ATOMICS_NOT_ALLOWED = 0, CUBLAS_ATOMICS_ALLOWED = 1 } cublasAtomicsMode_t;
+
+/* Enum for selecting among different GEMM algorithms */
+typedef enum {
+  CUBLAS_GEMM_DFALT = -1,  /* alias: same value as CUBLAS_GEMM_DEFAULT */
+  CUBLAS_GEMM_DEFAULT = -1,
+  CUBLAS_GEMM_ALGO0 = 0,
+  CUBLAS_GEMM_ALGO1 = 1,
+  CUBLAS_GEMM_ALGO2 = 2,
+  CUBLAS_GEMM_ALGO3 = 3,
+  CUBLAS_GEMM_ALGO4 = 4,
+  CUBLAS_GEMM_ALGO5 = 5,
+  CUBLAS_GEMM_ALGO6 = 6,
+  CUBLAS_GEMM_ALGO7 = 7,
+  CUBLAS_GEMM_ALGO8 = 8,
+  CUBLAS_GEMM_ALGO9 = 9,
+  CUBLAS_GEMM_ALGO10 = 10,
+  CUBLAS_GEMM_ALGO11 = 11,
+  CUBLAS_GEMM_ALGO12 = 12,
+  CUBLAS_GEMM_ALGO13 = 13,
+  CUBLAS_GEMM_ALGO14 = 14,
+  CUBLAS_GEMM_ALGO15 = 15,
+  CUBLAS_GEMM_ALGO16 = 16,
+  CUBLAS_GEMM_ALGO17 = 17,
+  CUBLAS_GEMM_ALGO18 = 18,  // sliced 32x32
+  CUBLAS_GEMM_ALGO19 = 19,  // sliced 64x32
+  CUBLAS_GEMM_ALGO20 = 20,  // sliced 128x32
+  CUBLAS_GEMM_ALGO21 = 21,  // sliced 32x32  -splitK
+  CUBLAS_GEMM_ALGO22 = 22,  // sliced 64x32  -splitK
+  CUBLAS_GEMM_ALGO23 = 23,  // sliced 128x32 -splitK
+  CUBLAS_GEMM_DEFAULT_TENSOR_OP = 99,
+  CUBLAS_GEMM_DFALT_TENSOR_OP = 99,  /* alias: same value as CUBLAS_GEMM_DEFAULT_TENSOR_OP */
+  CUBLAS_GEMM_ALGO0_TENSOR_OP = 100,
+  CUBLAS_GEMM_ALGO1_TENSOR_OP = 101,
+  CUBLAS_GEMM_ALGO2_TENSOR_OP = 102,
+  CUBLAS_GEMM_ALGO3_TENSOR_OP = 103,
+  CUBLAS_GEMM_ALGO4_TENSOR_OP = 104,
+  CUBLAS_GEMM_ALGO5_TENSOR_OP = 105,
+  CUBLAS_GEMM_ALGO6_TENSOR_OP = 106,
+  CUBLAS_GEMM_ALGO7_TENSOR_OP = 107,
+  CUBLAS_GEMM_ALGO8_TENSOR_OP = 108,
+  CUBLAS_GEMM_ALGO9_TENSOR_OP = 109,
+  CUBLAS_GEMM_ALGO10_TENSOR_OP = 110,
+  CUBLAS_GEMM_ALGO11_TENSOR_OP = 111,
+  CUBLAS_GEMM_ALGO12_TENSOR_OP = 112,
+  CUBLAS_GEMM_ALGO13_TENSOR_OP = 113,
+  CUBLAS_GEMM_ALGO14_TENSOR_OP = 114,
+  CUBLAS_GEMM_ALGO15_TENSOR_OP = 115
+} cublasGemmAlgo_t;
+
+/* Enum for default math mode/tensor operation */
+typedef enum {
+  CUBLAS_DEFAULT_MATH = 0,
+
+  /* deprecated, same effect as using CUBLAS_COMPUTE_32F_FAST_16F, will be removed in a future release */
+  CUBLAS_TENSOR_OP_MATH = 1,
+
+  /* same as using matching _PEDANTIC compute type when using cublas<T>routine calls or cublasEx() calls with
+     cudaDataType as compute type */
+  CUBLAS_PEDANTIC_MATH = 2,
+
+  /* allow accelerating single precision routines using TF32 tensor cores */
+  CUBLAS_TF32_TENSOR_OP_MATH = 3,
+
+  /* flag to force any reductions to use the accumulator type and not output type in case of mixed precision routines
+     with lower size output type */
+  CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION = 16,
+} cublasMath_t;
+
+/* For backward compatibility purposes */
+typedef cudaDataType cublasDataType_t;
+
+/* Enum for compute type
+ *
+ * - default types provide best available performance using all available hardware features
+ *   and guarantee internal storage precision with at least the same precision and range;
+ * - _PEDANTIC types ensure standard arithmetic and exact specified internal storage format;
+ * - _FAST types allow for some loss of precision to enable higher throughput arithmetic.
+ */
+typedef enum {
+  CUBLAS_COMPUTE_16F = 64,           /* half - default */
+  CUBLAS_COMPUTE_16F_PEDANTIC = 65,  /* half - pedantic */
+  CUBLAS_COMPUTE_32F = 68,           /* float - default */
+  CUBLAS_COMPUTE_32F_PEDANTIC = 69,  /* float - pedantic */
+  CUBLAS_COMPUTE_32F_FAST_16F = 74,  /* float - fast, allows down-converting inputs to half or TF32 */
+  CUBLAS_COMPUTE_32F_FAST_16BF = 75, /* float - fast, allows down-converting inputs to bfloat16 or TF32 */
+  CUBLAS_COMPUTE_32F_FAST_TF32 = 77, /* float - fast, allows down-converting inputs to TF32 */
+  CUBLAS_COMPUTE_64F = 70,           /* double - default */
+  CUBLAS_COMPUTE_64F_PEDANTIC = 71,  /* double - pedantic */
+  CUBLAS_COMPUTE_32I = 72,           /* signed 32-bit int - default */
+  CUBLAS_COMPUTE_32I_PEDANTIC = 73,  /* signed 32-bit int - pedantic */
+} cublasComputeType_t;
+
+/* Opaque structure holding CUBLAS library context */
+struct cublasContext;
+typedef struct cublasContext* cublasHandle_t;
+
+/* cuBLAS logging: user callback type; takes one message string and returns nothing */
+typedef void (*cublasLogCallback)(const char* msg);
+
+/* cuBLAS Exported API {{{ */
+
+/* --------------- CUBLAS Helper Functions  ---------------- */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t* handle);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int* version);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int* value);
+
+CUBLASAPI size_t CUBLASWINAPI cublasGetCudartVersion(void);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetWorkspace_v2(cublasHandle_t handle,
+                                                            void* workspace,
+                                                            size_t workspaceSizeInBytes);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle, cudaStream_t* streamId);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t* mode);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t mode);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t* mode);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t* mode);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetSmCountTarget(cublasHandle_t handle, int* smCountTarget);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetSmCountTarget(cublasHandle_t handle, int smCountTarget);
+
+CUBLASAPI const char* CUBLASWINAPI cublasGetStatusName(cublasStatus_t status);
+
+CUBLASAPI const char* CUBLASWINAPI cublasGetStatusString(cublasStatus_t status);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn,
+                                                            int logToStdOut,
+                                                            int logToStdErr,
+                                                            const char* logFileName);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetLoggerCallback(cublasLogCallback userCallback);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetLoggerCallback(cublasLogCallback* userCallback);
+
+cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void* x, int incx, void* devicePtr, int incy);
+
+cublasStatus_t CUBLASWINAPI
+cublasSetVector_64(int64_t n, int64_t elemSize, const void* x, int64_t incx, void* devicePtr, int64_t incy);
+
+cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void* x, int incx, void* y, int incy);
+
+cublasStatus_t CUBLASWINAPI
+cublasGetVector_64(int64_t n, int64_t elemSize, const void* x, int64_t incx, void* y, int64_t incy);
+
+cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb);
+
+cublasStatus_t CUBLASWINAPI
+cublasSetMatrix_64(int64_t rows, int64_t cols, int64_t elemSize, const void* A, int64_t lda, void* B, int64_t ldb);
+
+cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb);
+
+cublasStatus_t CUBLASWINAPI
+cublasGetMatrix_64(int64_t rows, int64_t cols, int64_t elemSize, const void* A, int64_t lda, void* B, int64_t ldb);
+
+cublasStatus_t CUBLASWINAPI cublasSetVectorAsync(
+    int n, int elemSize, const void* hostPtr, int incx, void* devicePtr, int incy, cudaStream_t stream);
+
+cublasStatus_t CUBLASWINAPI cublasSetVectorAsync_64(
+    int64_t n, int64_t elemSize, const void* hostPtr, int64_t incx, void* devicePtr, int64_t incy, cudaStream_t stream);
+
+cublasStatus_t CUBLASWINAPI cublasGetVectorAsync(
+    int n, int elemSize, const void* devicePtr, int incx, void* hostPtr, int incy, cudaStream_t stream);
+
+cublasStatus_t CUBLASWINAPI cublasGetVectorAsync_64(
+    int64_t n, int64_t elemSize, const void* devicePtr, int64_t incx, void* hostPtr, int64_t incy, cudaStream_t stream);
+
+cublasStatus_t CUBLASWINAPI
+cublasSetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream);
+
+cublasStatus_t CUBLASWINAPI cublasSetMatrixAsync_64(int64_t rows,
+                                                    int64_t cols,
+                                                    int64_t elemSize,
+                                                    const void* A,
+                                                    int64_t lda,
+                                                    void* B,
+                                                    int64_t ldb,
+                                                    cudaStream_t stream);
+
+cublasStatus_t CUBLASWINAPI
+cublasGetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream);
+
+cublasStatus_t CUBLASWINAPI cublasGetMatrixAsync_64(int64_t rows,
+                                                    int64_t cols,
+                                                    int64_t elemSize,
+                                                    const void* A,
+                                                    int64_t lda,
+                                                    void* B,
+                                                    int64_t ldb,
+                                                    cudaStream_t stream);
+
+CUBLASAPI void CUBLASWINAPI cublasXerbla(const char* srName, int info);
+
+/* --------------- CUBLAS BLAS1 Functions  ---------------- */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle,
+                                                   int n,
+                                                   const void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   void* result,
+                                                   cudaDataType resultType,
+                                                   cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasNrm2Ex_64(cublasHandle_t handle,
+                                                      int64_t n,
+                                                      const void* x,
+                                                      cudaDataType xType,
+                                                      int64_t incx,
+                                                      void* result,
+                                                      cudaDataType resultType,
+                                                      cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSnrm2_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSnrm2_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, float* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDnrm2_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, double* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasScnrm2_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasScnrm2_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, float* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDznrm2_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDznrm2_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, double* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle,
+                                                  int n,
+                                                  const void* x,
+                                                  cudaDataType xType,
+                                                  int incx,
+                                                  const void* y,
+                                                  cudaDataType yType,
+                                                  int incy,
+                                                  void* result,
+                                                  cudaDataType resultType,
+                                                  cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotEx_64(cublasHandle_t handle,
+                                                     int64_t n,
+                                                     const void* x,
+                                                     cudaDataType xType,
+                                                     int64_t incx,
+                                                     const void* y,
+                                                     cudaDataType yType,
+                                                     int64_t incy,
+                                                     void* result,
+                                                     cudaDataType resultType,
+                                                     cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle,
+                                                   int n,
+                                                   const void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   const void* y,
+                                                   cudaDataType yType,
+                                                   int incy,
+                                                   void* result,
+                                                   cudaDataType resultType,
+                                                   cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotcEx_64(cublasHandle_t handle,
+                                                      int64_t n,
+                                                      const void* x,
+                                                      cudaDataType xType,
+                                                      int64_t incx,
+                                                      const void* y,
+                                                      cudaDataType yType,
+                                                      int64_t incy,
+                                                      void* result,
+                                                      cudaDataType resultType,
+                                                      cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSdot_v2(cublasHandle_t handle, int n, const float* x, int incx, const float* y, int incy, float* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdot_v2_64(
+    cublasHandle_t handle, int64_t n, const float* x, int64_t incx, const float* y, int64_t incy, float* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDdot_v2(cublasHandle_t handle, int n, const double* x, int incx, const double* y, int incy, double* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdot_v2_64(
+    cublasHandle_t handle, int64_t n, const double* x, int64_t incx, const double* y, int64_t incy, double* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotu_v2(
+    cublasHandle_t handle, int n, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotu_v2_64(cublasHandle_t handle,
+                                                        int64_t n,
+                                                        const cuComplex* x,
+                                                        int64_t incx,
+                                                        const cuComplex* y,
+                                                        int64_t incy,
+                                                        cuComplex* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotc_v2(
+    cublasHandle_t handle, int n, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotc_v2_64(cublasHandle_t handle,
+                                                        int64_t n,
+                                                        const cuComplex* x,
+                                                        int64_t incx,
+                                                        const cuComplex* y,
+                                                        int64_t incy,
+                                                        cuComplex* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* y,
+                                                     int incy,
+                                                     cuDoubleComplex* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotu_v2_64(cublasHandle_t handle,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* y,
+                                                        int64_t incy,
+                                                        cuDoubleComplex* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* y,
+                                                     int incy,
+                                                     cuDoubleComplex* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotc_v2_64(cublasHandle_t handle,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* y,
+                                                        int64_t incy,
+                                                        cuDoubleComplex* result);
+/* SCAL: Ex forms tag void* alpha/x with cudaDataType; Csscal/Zdscal take float/double alpha; _64 forms use int64_t n/incx. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle,
+                                                   int n,
+                                                   const void* alpha,
+                                                   cudaDataType alphaType,
+                                                   void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScalEx_64(cublasHandle_t handle,
+                                                      int64_t n,
+                                                      const void* alpha,
+                                                      cudaDataType alphaType,
+                                                      void* x,
+                                                      cudaDataType xType,
+                                                      int64_t incx,
+                                                      cudaDataType executionType);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSscal_v2(cublasHandle_t handle, int n, const float* alpha, float* x, int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSscal_v2_64(cublasHandle_t handle, int64_t n, const float* alpha, float* x, int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDscal_v2(cublasHandle_t handle, int n, const double* alpha, double* x, int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDscal_v2_64(cublasHandle_t handle, int64_t n, const double* alpha, double* x, int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCscal_v2(cublasHandle_t handle, int n, const cuComplex* alpha, cuComplex* x, int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCscal_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* alpha, cuComplex* x, int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCsscal_v2(cublasHandle_t handle, int n, const float* alpha, cuComplex* x, int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCsscal_v2_64(cublasHandle_t handle, int64_t n, const float* alpha, cuComplex* x, int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZscal_v2(cublasHandle_t handle, int n, const cuDoubleComplex* alpha, cuDoubleComplex* x, int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZscal_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* alpha, cuDoubleComplex* x, int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZdscal_v2(cublasHandle_t handle, int n, const double* alpha, cuDoubleComplex* x, int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZdscal_v2_64(cublasHandle_t handle, int64_t n, const double* alpha, cuDoubleComplex* x, int64_t incx);
+/* AXPY: alpha and x are const, y is the only writable operand; Ex forms tag alpha/x/y; _64 forms use int64_t n/incx/incy. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAxpyEx(cublasHandle_t handle,
+                                                   int n,
+                                                   const void* alpha,
+                                                   cudaDataType alphaType,
+                                                   const void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   void* y,
+                                                   cudaDataType yType,
+                                                   int incy,
+                                                   cudaDataType executiontype);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAxpyEx_64(cublasHandle_t handle,
+                                                      int64_t n,
+                                                      const void* alpha,
+                                                      cudaDataType alphaType,
+                                                      const void* x,
+                                                      cudaDataType xType,
+                                                      int64_t incx,
+                                                      void* y,
+                                                      cudaDataType yType,
+                                                      int64_t incy,
+                                                      cudaDataType executiontype);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSaxpy_v2(cublasHandle_t handle, int n, const float* alpha, const float* x, int incx, float* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSaxpy_v2_64(
+    cublasHandle_t handle, int64_t n, const float* alpha, const float* x, int64_t incx, float* y, int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDaxpy_v2(cublasHandle_t handle, int n, const double* alpha, const double* x, int incx, double* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDaxpy_v2_64(
+    cublasHandle_t handle, int64_t n, const double* alpha, const double* x, int64_t incx, double* y, int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCaxpy_v2(
+    cublasHandle_t handle, int n, const cuComplex* alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCaxpy_v2_64(cublasHandle_t handle,
+                                                        int64_t n,
+                                                        const cuComplex* alpha,
+                                                        const cuComplex* x,
+                                                        int64_t incx,
+                                                        cuComplex* y,
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZaxpy_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     cuDoubleComplex* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZaxpy_v2_64(cublasHandle_t handle,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* alpha,
+                                                        const cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        cuDoubleComplex* y,
+                                                        int64_t incy);
+/* COPY: x is const and y is writable; CopyEx tags x and y with separate cudaDataType values; _64 forms use int64_t sizes. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCopyEx(
+    cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, void* y, cudaDataType yType, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCopyEx_64(cublasHandle_t handle,
+                                                      int64_t n,
+                                                      const void* x,
+                                                      cudaDataType xType,
+                                                      int64_t incx,
+                                                      void* y,
+                                                      cudaDataType yType,
+                                                      int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasScopy_v2(cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasScopy_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, float* y, int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDcopy_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, double* y, int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCcopy_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, cuComplex* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCcopy_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, cuComplex* y, int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZcopy_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZcopy_v2_64(
+    cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy);
+/* SWAP: x and y are both non-const; SwapEx tags each with its own cudaDataType; _64 forms use int64_t n/incx/incy. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSswap_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSswap_v2_64(cublasHandle_t handle, int64_t n, float* x, int64_t incx, float* y, int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDswap_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDswap_v2_64(cublasHandle_t handle, int64_t n, double* x, int64_t incx, double* y, int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCswap_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCswap_v2_64(cublasHandle_t handle, int64_t n, cuComplex* x, int64_t incx, cuComplex* y, int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZswap_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZswap_v2_64(cublasHandle_t handle, int64_t n, cuDoubleComplex* x, int64_t incx, cuDoubleComplex* y, int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSwapEx(
+    cublasHandle_t handle, int n, void* x, cudaDataType xType, int incx, void* y, cudaDataType yType, int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSwapEx_64(cublasHandle_t handle,
+                                                      int64_t n,
+                                                      void* x,
+                                                      cudaDataType xType,
+                                                      int64_t incx,
+                                                      void* y,
+                                                      cudaDataType yType,
+                                                      int64_t incy);
+/* AMAX: result is int* in the int forms and int64_t* in the _64 forms; presumably a 1-based BLAS index - confirm in docs. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIsamax_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIsamax_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, int64_t* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIdamax_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, int64_t* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIcamax_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIcamax_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, int64_t* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIzamax_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIzamax_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, int64_t* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIamaxEx(cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIamaxEx_64(cublasHandle_t handle, int64_t n, const void* x, cudaDataType xType, int64_t incx, int64_t* result);
+/* AMIN: result is int* in the int forms and int64_t* in the _64 forms; presumably a 1-based BLAS index - confirm in docs. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIsamin_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIsamin_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, int64_t* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIdamin_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, int64_t* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIcamin_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIcamin_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, int64_t* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIzamin_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIzamin_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, int64_t* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIaminEx(cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasIaminEx_64(cublasHandle_t handle, int64_t n, const void* x, cudaDataType xType, int64_t incx, int64_t* result);
+/* ASUM: Scasum/Dzasum read cuComplex/cuDoubleComplex x but write a float/double result; AsumEx tags x and result apart. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAsumEx(cublasHandle_t handle,
+                                                   int n,
+                                                   const void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   void* result,
+                                                   cudaDataType resultType,
+                                                   cudaDataType executiontype);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAsumEx_64(cublasHandle_t handle,
+                                                      int64_t n,
+                                                      const void* x,
+                                                      cudaDataType xType,
+                                                      int64_t incx,
+                                                      void* result,
+                                                      cudaDataType resultType,
+                                                      cudaDataType executiontype);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSasum_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSasum_v2_64(cublasHandle_t handle, int64_t n, const float* x, int64_t incx, float* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDasum_v2_64(cublasHandle_t handle, int64_t n, const double* x, int64_t incx, double* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasScasum_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasScasum_v2_64(cublasHandle_t handle, int64_t n, const cuComplex* x, int64_t incx, float* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDzasum_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDzasum_v2_64(cublasHandle_t handle, int64_t n, const cuDoubleComplex* x, int64_t incx, double* result);
+/* ROT: x/y are writable, c/s const; c is float/double, s matches x in Crot/Zrot but is float/double in Csrot/Zdrot. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSrot_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, const float* c, const float* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrot_v2_64(
+    cublasHandle_t handle, int64_t n, float* x, int64_t incx, float* y, int64_t incy, const float* c, const float* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDrot_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, const double* c, const double* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrot_v2_64(cublasHandle_t handle,
+                                                       int64_t n,
+                                                       double* x,
+                                                       int64_t incx,
+                                                       double* y,
+                                                       int64_t incy,
+                                                       const double* c,
+                                                       const double* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrot_v2(
+    cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy, const float* c, const cuComplex* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrot_v2_64(cublasHandle_t handle,
+                                                       int64_t n,
+                                                       cuComplex* x,
+                                                       int64_t incx,
+                                                       cuComplex* y,
+                                                       int64_t incy,
+                                                       const float* c,
+                                                       const cuComplex* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsrot_v2(
+    cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy, const float* c, const float* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsrot_v2_64(cublasHandle_t handle,
+                                                        int64_t n,
+                                                        cuComplex* x,
+                                                        int64_t incx,
+                                                        cuComplex* y,
+                                                        int64_t incy,
+                                                        const float* c,
+                                                        const float* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrot_v2(cublasHandle_t handle,
+                                                    int n,
+                                                    cuDoubleComplex* x,
+                                                    int incx,
+                                                    cuDoubleComplex* y,
+                                                    int incy,
+                                                    const double* c,
+                                                    const cuDoubleComplex* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrot_v2_64(cublasHandle_t handle,
+                                                       int64_t n,
+                                                       cuDoubleComplex* x,
+                                                       int64_t incx,
+                                                       cuDoubleComplex* y,
+                                                       int64_t incy,
+                                                       const double* c,
+                                                       const cuDoubleComplex* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle,
+                                                     int n,
+                                                     cuDoubleComplex* x,
+                                                     int incx,
+                                                     cuDoubleComplex* y,
+                                                     int incy,
+                                                     const double* c,
+                                                     const double* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdrot_v2_64(cublasHandle_t handle,
+                                                        int64_t n,
+                                                        cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        cuDoubleComplex* y,
+                                                        int64_t incy,
+                                                        const double* c,
+                                                        const double* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotEx(cublasHandle_t handle,
+                                                  int n,
+                                                  void* x,
+                                                  cudaDataType xType,
+                                                  int incx,
+                                                  void* y,
+                                                  cudaDataType yType,
+                                                  int incy,
+                                                  const void* c,
+                                                  const void* s,
+                                                  cudaDataType csType,
+                                                  cudaDataType executiontype);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotEx_64(cublasHandle_t handle,
+                                                     int64_t n,
+                                                     void* x,
+                                                     cudaDataType xType,
+                                                     int64_t incx,
+                                                     void* y,
+                                                     cudaDataType yType,
+                                                     int64_t incy,
+                                                     const void* c,
+                                                     const void* s,
+                                                     cudaDataType csType,
+                                                     cudaDataType executiontype);
+/* ROTG: a, b, c and s are all non-const; c stays float/double in Crotg/Zrotg; RotgEx shares abType for a/b and csType for c/s. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle, float* a, float* b, float* c, float* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle, double* a, double* b, double* c, double* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCrotg_v2(cublasHandle_t handle, cuComplex* a, cuComplex* b, float* c, cuComplex* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasZrotg_v2(cublasHandle_t handle, cuDoubleComplex* a, cuDoubleComplex* b, double* c, cuDoubleComplex* s);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle,
+                                                   void* a,
+                                                   void* b,
+                                                   cudaDataType abType,
+                                                   void* c,
+                                                   void* s,
+                                                   cudaDataType csType,
+                                                   cudaDataType executiontype);
+/* ROTM: x and y are writable, param is const; RotmEx adds paramType next to xType/yType; _64 forms use int64_t sizes. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSrotm_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy, const float* param);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSrotm_v2_64(cublasHandle_t handle, int64_t n, float* x, int64_t incx, float* y, int64_t incy, const float* param);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDrotm_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy, const double* param);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotm_v2_64(
+    cublasHandle_t handle, int64_t n, double* x, int64_t incx, double* y, int64_t incy, const double* param);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmEx(cublasHandle_t handle,
+                                                   int n,
+                                                   void* x,
+                                                   cudaDataType xType,
+                                                   int incx,
+                                                   void* y,
+                                                   cudaDataType yType,
+                                                   int incy,
+                                                   const void* param,
+                                                   cudaDataType paramType,
+                                                   cudaDataType executiontype);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmEx_64(cublasHandle_t handle,
+                                                      int64_t n,
+                                                      void* x,
+                                                      cudaDataType xType,
+                                                      int64_t incx,
+                                                      void* y,
+                                                      cudaDataType yType,
+                                                      int64_t incy,
+                                                      const void* param,
+                                                      cudaDataType paramType,
+                                                      cudaDataType executiontype);
+/* ROTMG: d1, d2, x1 and param are non-const while y1 is const; RotmgEx tags each operand with its own cudaDataType. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSrotmg_v2(cublasHandle_t handle, float* d1, float* d2, float* x1, const float* y1, float* param);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDrotmg_v2(cublasHandle_t handle, double* d1, double* d2, double* x1, const double* y1, double* param);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmgEx(cublasHandle_t handle,
+                                                    void* d1,
+                                                    cudaDataType d1Type,
+                                                    void* d2,
+                                                    cudaDataType d2Type,
+                                                    void* x1,
+                                                    cudaDataType x1Type,
+                                                    const void* y1,
+                                                    cudaDataType y1Type,
+                                                    void* param,
+                                                    cudaDataType paramType,
+                                                    cudaDataType executiontype);
+
+/* --------------- CUBLAS BLAS2 Functions  ---------------- */
+
+/* GEMV: S/D/C/Z forms share (trans, m, n, alpha, A, lda, x, incx, beta, y, incy); only y is non-const; _64 uses int64_t. */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     const float* alpha,
+                                                     const float* A,
+                                                     int lda,
+                                                     const float* x,
+                                                     int incx,
+                                                     const float* beta,
+                                                     float* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemv_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t trans,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        const float* alpha,
+                                                        const float* A,
+                                                        int64_t lda,
+                                                        const float* x,
+                                                        int64_t incx,
+                                                        const float* beta,
+                                                        float* y,
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     const double* alpha,
+                                                     const double* A,
+                                                     int lda,
+                                                     const double* x,
+                                                     int incx,
+                                                     const double* beta,
+                                                     double* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemv_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t trans,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        const double* alpha,
+                                                        const double* A,
+                                                        int64_t lda,
+                                                        const double* x,
+                                                        int64_t incx,
+                                                        const double* beta,
+                                                        double* y,
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex* alpha,
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* beta,
+                                                     cuComplex* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemv_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t trans,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        const cuComplex* alpha,
+                                                        const cuComplex* A,
+                                                        int64_t lda,
+                                                        const cuComplex* x,
+                                                        int64_t incx,
+                                                        const cuComplex* beta,
+                                                        cuComplex* y,
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha,
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* beta,
+                                                     cuDoubleComplex* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemv_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t trans,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* alpha,
+                                                        const cuDoubleComplex* A,
+                                                        int64_t lda,
+                                                        const cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* beta,
+                                                        cuDoubleComplex* y,
+                                                        int64_t incy);
+
+/* GBMV: general banded matrix-vector multiply */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgbmv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     int kl,
+                                                     int ku,
+                                                     const float* alpha,
+                                                     const float* A,
+                                                     int lda,
+                                                     const float* x,
+                                                     int incx,
+                                                     const float* beta,
+                                                     float* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgbmv_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t trans,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        int64_t kl,
+                                                        int64_t ku,
+                                                        const float* alpha,
+                                                        const float* A,
+                                                        int64_t lda,
+                                                        const float* x,
+                                                        int64_t incx,
+                                                        const float* beta,
+                                                        float* y,
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgbmv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     int kl,
+                                                     int ku,
+                                                     const double* alpha,
+                                                     const double* A,
+                                                     int lda,
+                                                     const double* x,
+                                                     int incx,
+                                                     const double* beta,
+                                                     double* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgbmv_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t trans,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        int64_t kl,
+                                                        int64_t ku,
+                                                        const double* alpha,
+                                                        const double* A,
+                                                        int64_t lda,
+                                                        const double* x,
+                                                        int64_t incx,
+                                                        const double* beta,
+                                                        double* y,
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgbmv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     int kl,
+                                                     int ku,
+                                                     const cuComplex* alpha,
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* beta,
+                                                     cuComplex* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgbmv_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t trans,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        int64_t kl,
+                                                        int64_t ku,
+                                                        const cuComplex* alpha,
+                                                        const cuComplex* A,
+                                                        int64_t lda,
+                                                        const cuComplex* x,
+                                                        int64_t incx,
+                                                        const cuComplex* beta,
+                                                        cuComplex* y,
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgbmv_v2(cublasHandle_t handle,
+                                                     cublasOperation_t trans,
+                                                     int m,
+                                                     int n,
+                                                     int kl,
+                                                     int ku,
+                                                     const cuDoubleComplex* alpha,
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* beta,
+                                                     cuDoubleComplex* y,
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgbmv_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t trans,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        int64_t kl,
+                                                        int64_t ku,
+                                                        const cuDoubleComplex* alpha,
+                                                        const cuDoubleComplex* A,
+                                                        int64_t lda,
+                                                        const cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* beta,
+                                                        cuDoubleComplex* y,
+                                                        int64_t incy);
+
+/* TRMV: triangular matrix-vector multiply */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const float* A,
+                                                     int lda,
+                                                     float* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const float* A,
+                                                        int64_t lda,
+                                                        float* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const double* A,
+                                                     int lda,
+                                                     double* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const double* A,
+                                                        int64_t lda,
+                                                        double* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     cuComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const cuComplex* A,
+                                                        int64_t lda,
+                                                        cuComplex* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     cuDoubleComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* A,
+                                                        int64_t lda,
+                                                        cuDoubleComplex* x,
+                                                        int64_t incx);
+
+/* TBMV: triangular banded matrix-vector multiply */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const float* A,
+                                                     int lda,
+                                                     float* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const float* A,
+                                                        int64_t lda,
+                                                        float* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const double* A,
+                                                     int lda,
+                                                     double* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const double* A,
+                                                        int64_t lda,
+                                                        double* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     cuComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const cuComplex* A,
+                                                        int64_t lda,
+                                                        cuComplex* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     cuDoubleComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const cuDoubleComplex* A,
+                                                        int64_t lda,
+                                                        cuDoubleComplex* x,
+                                                        int64_t incx);
+
+/* TPMV: triangular packed matrix-vector multiply */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const float* AP,
+                                                     float* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const float* AP,
+                                                        float* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const double* AP,
+                                                     double* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const double* AP,
+                                                        double* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuComplex* AP,
+                                                     cuComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const cuComplex* AP,
+                                                        cuComplex* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuDoubleComplex* AP,
+                                                     cuDoubleComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpmv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* AP,
+                                                        cuDoubleComplex* x,
+                                                        int64_t incx);
+
+/* TRSV: triangular solve with a single right-hand-side vector */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const float* A,
+                                                     int lda,
+                                                     float* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const float* A,
+                                                        int64_t lda,
+                                                        float* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const double* A,
+                                                     int lda,
+                                                     double* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const double* A,
+                                                        int64_t lda,
+                                                        double* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     cuComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const cuComplex* A,
+                                                        int64_t lda,
+                                                        cuComplex* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     cuDoubleComplex* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* A,
+                                                        int64_t lda,
+                                                        cuDoubleComplex* x,
+                                                        int64_t incx);
+
+/* TPSV: triangular packed solve with a single right-hand-side vector */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpsv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const float* AP,
+                                                     float* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpsv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const float* AP,
+                                                        float* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpsv_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const double* AP,
+                                                     double* x,
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpsv_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const double* AP,
+                                                        double* x,
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpsv_v2(cublasHandle_t handle, /* tpsv on cuComplex data; n and incx are int (int64_t form: cublasCtpsv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS CTPSV, a packed triangular solve - confirm in cuBLAS docs */
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuComplex* AP, /* const: read-only input */
+                                                     cuComplex* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpsv_v2_64(cublasHandle_t handle, /* same parameter list as cublasCtpsv_v2, with n and incx declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const cuComplex* AP, /* const: read-only input */
+                                                        cuComplex* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle, /* tpsv on cuDoubleComplex data; n and incx are int (int64_t form: cublasZtpsv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS ZTPSV, a packed triangular solve - confirm in cuBLAS docs */
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     const cuDoubleComplex* AP, /* const: read-only input */
+                                                     cuDoubleComplex* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpsv_v2_64(cublasHandle_t handle, /* same parameter list as cublasZtpsv_v2, with n and incx declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* AP, /* const: read-only input */
+                                                        cuDoubleComplex* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incx);
+
+/* TBSV */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle, /* tbsv on float data; n, k, lda and incx are int (int64_t form: cublasStbsv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS STBSV, a banded triangular solve - confirm in cuBLAS docs */
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const float* A, /* const: read-only input */
+                                                     int lda,
+                                                     float* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbsv_v2_64(cublasHandle_t handle, /* same parameter list as cublasStbsv_v2, with n, k, lda and incx declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const float* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        float* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle, /* tbsv on double data; n, k, lda and incx are int (int64_t form: cublasDtbsv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS DTBSV, a banded triangular solve - confirm in cuBLAS docs */
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const double* A, /* const: read-only input */
+                                                     int lda,
+                                                     double* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbsv_v2_64(cublasHandle_t handle, /* same parameter list as cublasDtbsv_v2, with n, k, lda and incx declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const double* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        double* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle, /* tbsv on cuComplex data; n, k, lda and incx are int (int64_t form: cublasCtbsv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS CTBSV, a banded triangular solve - confirm in cuBLAS docs */
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const cuComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     cuComplex* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbsv_v2_64(cublasHandle_t handle, /* same parameter list as cublasCtbsv_v2, with n, k, lda and incx declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const cuComplex* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        cuComplex* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle, /* tbsv on cuDoubleComplex data; n, k, lda and incx are int (int64_t form: cublasZtbsv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS ZTBSV, a banded triangular solve - confirm in cuBLAS docs */
+                                                     cublasOperation_t trans,
+                                                     cublasDiagType_t diag,
+                                                     int n,
+                                                     int k,
+                                                     const cuDoubleComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     cuDoubleComplex* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incx);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbsv_v2_64(cublasHandle_t handle, /* same parameter list as cublasZtbsv_v2, with n, k, lda and incx declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        cublasDiagType_t diag,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const cuDoubleComplex* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        cuDoubleComplex* x, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incx);
+
+/* SYMV/HEMV */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymv_v2(cublasHandle_t handle, /* symv on float data; n, lda, incx and incy are int (int64_t form: cublasSsymv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS SSYMV, a symmetric matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     const float* alpha, /* scalar passed by const pointer */
+                                                     const float* A, /* const: read-only input */
+                                                     int lda,
+                                                     const float* x, /* const: read-only input */
+                                                     int incx,
+                                                     const float* beta, /* scalar passed by const pointer */
+                                                     float* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymv_v2_64(cublasHandle_t handle, /* same parameter list as cublasSsymv_v2, with n, lda, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const float* alpha, /* scalar passed by const pointer */
+                                                        const float* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        const float* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const float* beta, /* scalar passed by const pointer */
+                                                        float* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymv_v2(cublasHandle_t handle, /* symv on double data; n, lda, incx and incy are int (int64_t form: cublasDsymv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS DSYMV, a symmetric matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     const double* alpha, /* scalar passed by const pointer */
+                                                     const double* A, /* const: read-only input */
+                                                     int lda,
+                                                     const double* x, /* const: read-only input */
+                                                     int incx,
+                                                     const double* beta, /* scalar passed by const pointer */
+                                                     double* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymv_v2_64(cublasHandle_t handle, /* same parameter list as cublasDsymv_v2, with n, lda, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const double* alpha, /* scalar passed by const pointer */
+                                                        const double* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        const double* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const double* beta, /* scalar passed by const pointer */
+                                                        double* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymv_v2(cublasHandle_t handle, /* symv on cuComplex data; n, lda, incx and incy are int (int64_t form: cublasCsymv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests a complex symmetric (not Hermitian) matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     const cuComplex* alpha, /* scalar passed by const pointer */
+                                                     const cuComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* beta, /* scalar passed by const pointer */
+                                                     cuComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymv_v2_64(cublasHandle_t handle, /* same parameter list as cublasCsymv_v2, with n, lda, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuComplex* alpha, /* scalar passed by const pointer */
+                                                        const cuComplex* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        const cuComplex* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const cuComplex* beta, /* scalar passed by const pointer */
+                                                        cuComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymv_v2(cublasHandle_t handle, /* symv on cuDoubleComplex data; n, lda, incx and incy are int (int64_t form: cublasZsymv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests a complex symmetric (not Hermitian) matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* scalar passed by const pointer */
+                                                     const cuDoubleComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* beta, /* scalar passed by const pointer */
+                                                     cuDoubleComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymv_v2_64(cublasHandle_t handle, /* same parameter list as cublasZsymv_v2, with n, lda, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* alpha, /* scalar passed by const pointer */
+                                                        const cuDoubleComplex* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        const cuDoubleComplex* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* beta, /* scalar passed by const pointer */
+                                                        cuDoubleComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemv_v2(cublasHandle_t handle, /* hemv on cuComplex data; n, lda, incx and incy are int (int64_t form: cublasChemv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS CHEMV, a Hermitian matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     const cuComplex* alpha, /* scalar passed by const pointer */
+                                                     const cuComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* beta, /* scalar passed by const pointer */
+                                                     cuComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemv_v2_64(cublasHandle_t handle, /* same parameter list as cublasChemv_v2, with n, lda, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuComplex* alpha, /* scalar passed by const pointer */
+                                                        const cuComplex* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        const cuComplex* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const cuComplex* beta, /* scalar passed by const pointer */
+                                                        cuComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemv_v2(cublasHandle_t handle, /* hemv on cuDoubleComplex data; n, lda, incx and incy are int (int64_t form: cublasZhemv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS ZHEMV, a Hermitian matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* scalar passed by const pointer */
+                                                     const cuDoubleComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* beta, /* scalar passed by const pointer */
+                                                     cuDoubleComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemv_v2_64(cublasHandle_t handle, /* same parameter list as cublasZhemv_v2, with n, lda, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* alpha, /* scalar passed by const pointer */
+                                                        const cuDoubleComplex* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        const cuDoubleComplex* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* beta, /* scalar passed by const pointer */
+                                                        cuDoubleComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+/* SBMV/HBMV */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsbmv_v2(cublasHandle_t handle, /* sbmv on float data; n, k, lda, incx and incy are int (int64_t form: cublasSsbmv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS SSBMV, a symmetric banded matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     int k,
+                                                     const float* alpha, /* scalar passed by const pointer */
+                                                     const float* A, /* const: read-only input */
+                                                     int lda,
+                                                     const float* x, /* const: read-only input */
+                                                     int incx,
+                                                     const float* beta, /* scalar passed by const pointer */
+                                                     float* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsbmv_v2_64(cublasHandle_t handle, /* same parameter list as cublasSsbmv_v2, with n, k, lda, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const float* alpha, /* scalar passed by const pointer */
+                                                        const float* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        const float* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const float* beta, /* scalar passed by const pointer */
+                                                        float* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsbmv_v2(cublasHandle_t handle, /* sbmv on double data; n, k, lda, incx and incy are int (int64_t form: cublasDsbmv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS DSBMV, a symmetric banded matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     int k,
+                                                     const double* alpha, /* scalar passed by const pointer */
+                                                     const double* A, /* const: read-only input */
+                                                     int lda,
+                                                     const double* x, /* const: read-only input */
+                                                     int incx,
+                                                     const double* beta, /* scalar passed by const pointer */
+                                                     double* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsbmv_v2_64(cublasHandle_t handle, /* same parameter list as cublasDsbmv_v2, with n, k, lda, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const double* alpha, /* scalar passed by const pointer */
+                                                        const double* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        const double* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const double* beta, /* scalar passed by const pointer */
+                                                        double* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChbmv_v2(cublasHandle_t handle, /* hbmv on cuComplex data; n, k, lda, incx and incy are int (int64_t form: cublasChbmv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS CHBMV, a Hermitian banded matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     int k,
+                                                     const cuComplex* alpha, /* scalar passed by const pointer */
+                                                     const cuComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* beta, /* scalar passed by const pointer */
+                                                     cuComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChbmv_v2_64(cublasHandle_t handle, /* same parameter list as cublasChbmv_v2, with n, k, lda, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const cuComplex* alpha, /* scalar passed by const pointer */
+                                                        const cuComplex* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        const cuComplex* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const cuComplex* beta, /* scalar passed by const pointer */
+                                                        cuComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhbmv_v2(cublasHandle_t handle, /* hbmv on cuDoubleComplex data; n, k, lda, incx and incy are int (int64_t form: cublasZhbmv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS ZHBMV, a Hermitian banded matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     int k,
+                                                     const cuDoubleComplex* alpha, /* scalar passed by const pointer */
+                                                     const cuDoubleComplex* A, /* const: read-only input */
+                                                     int lda,
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* beta, /* scalar passed by const pointer */
+                                                     cuDoubleComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhbmv_v2_64(cublasHandle_t handle, /* same parameter list as cublasZhbmv_v2, with n, k, lda, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const cuDoubleComplex* alpha, /* scalar passed by const pointer */
+                                                        const cuDoubleComplex* A, /* const: read-only input */
+                                                        int64_t lda,
+                                                        const cuDoubleComplex* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* beta, /* scalar passed by const pointer */
+                                                        cuDoubleComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+/* SPMV/HPMV */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspmv_v2(cublasHandle_t handle, /* spmv on float data; n, incx and incy are int (int64_t form: cublasSspmv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS SSPMV, a symmetric packed matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     const float* alpha, /* scalar passed by const pointer */
+                                                     const float* AP, /* const: read-only input; no lda parameter accompanies it */
+                                                     const float* x, /* const: read-only input */
+                                                     int incx,
+                                                     const float* beta, /* scalar passed by const pointer */
+                                                     float* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspmv_v2_64(cublasHandle_t handle, /* same parameter list as cublasSspmv_v2, with n, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const float* alpha, /* scalar passed by const pointer */
+                                                        const float* AP, /* const: read-only input; no lda parameter accompanies it */
+                                                        const float* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const float* beta, /* scalar passed by const pointer */
+                                                        float* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspmv_v2(cublasHandle_t handle, /* spmv on double data; n, incx and incy are int (int64_t form: cublasDspmv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS DSPMV, a symmetric packed matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     const double* alpha, /* scalar passed by const pointer */
+                                                     const double* AP, /* const: read-only input; no lda parameter accompanies it */
+                                                     const double* x, /* const: read-only input */
+                                                     int incx,
+                                                     const double* beta, /* scalar passed by const pointer */
+                                                     double* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspmv_v2_64(cublasHandle_t handle, /* same parameter list as cublasDspmv_v2, with n, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const double* alpha, /* scalar passed by const pointer */
+                                                        const double* AP, /* const: read-only input; no lda parameter accompanies it */
+                                                        const double* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const double* beta, /* scalar passed by const pointer */
+                                                        double* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpmv_v2(cublasHandle_t handle, /* hpmv on cuComplex data; n, incx and incy are int (int64_t form: cublasChpmv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS CHPMV, a Hermitian packed matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     const cuComplex* alpha, /* scalar passed by const pointer */
+                                                     const cuComplex* AP, /* const: read-only input; no lda parameter accompanies it */
+                                                     const cuComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuComplex* beta, /* scalar passed by const pointer */
+                                                     cuComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpmv_v2_64(cublasHandle_t handle, /* same parameter list as cublasChpmv_v2, with n, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuComplex* alpha, /* scalar passed by const pointer */
+                                                        const cuComplex* AP, /* const: read-only input; no lda parameter accompanies it */
+                                                        const cuComplex* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const cuComplex* beta, /* scalar passed by const pointer */
+                                                        cuComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpmv_v2(cublasHandle_t handle, /* hpmv on cuDoubleComplex data; n, incx and incy are int (int64_t form: cublasZhpmv_v2_64) */
+                                                     cublasFillMode_t uplo, /* NOTE(review): name suggests BLAS ZHPMV, a Hermitian packed matrix-vector product - confirm in cuBLAS docs */
+                                                     int n,
+                                                     const cuDoubleComplex* alpha, /* scalar passed by const pointer */
+                                                     const cuDoubleComplex* AP, /* const: read-only input; no lda parameter accompanies it */
+                                                     const cuDoubleComplex* x, /* const: read-only input */
+                                                     int incx,
+                                                     const cuDoubleComplex* beta, /* scalar passed by const pointer */
+                                                     cuDoubleComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                     int incy);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpmv_v2_64(cublasHandle_t handle, /* same parameter list as cublasZhpmv_v2, with n, incx and incy declared int64_t instead of int */
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* alpha, /* scalar passed by const pointer */
+                                                        const cuDoubleComplex* AP, /* const: read-only input; no lda parameter accompanies it */
+                                                        const cuDoubleComplex* x, /* const: read-only input */
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* beta, /* scalar passed by const pointer */
+                                                        cuDoubleComplex* y, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                        int64_t incy);
+
+/* GER */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSger_v2(cublasHandle_t handle, /* ger on float data; m, n, incx, incy and lda are int (int64_t form: cublasSger_v2_64) */
+                                                    int m, /* NOTE(review): name suggests BLAS SGER, a rank-1 update of A from x and y - confirm in cuBLAS docs */
+                                                    int n,
+                                                    const float* alpha, /* scalar passed by const pointer */
+                                                    const float* x, /* const: read-only input */
+                                                    int incx,
+                                                    const float* y, /* const: read-only input */
+                                                    int incy,
+                                                    float* A, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSger_v2_64(cublasHandle_t handle, /* same parameter list as cublasSger_v2, with m, n, incx, incy and lda declared int64_t instead of int */
+                                                       int64_t m,
+                                                       int64_t n,
+                                                       const float* alpha, /* scalar passed by const pointer */
+                                                       const float* x, /* const: read-only input */
+                                                       int64_t incx,
+                                                       const float* y, /* const: read-only input */
+                                                       int64_t incy,
+                                                       float* A, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                       int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDger_v2(cublasHandle_t handle, /* ger on double data; m, n, incx, incy and lda are int (int64_t form: cublasDger_v2_64) */
+                                                    int m, /* NOTE(review): name suggests BLAS DGER, a rank-1 update of A from x and y - confirm in cuBLAS docs */
+                                                    int n,
+                                                    const double* alpha, /* scalar passed by const pointer */
+                                                    const double* x, /* const: read-only input */
+                                                    int incx,
+                                                    const double* y, /* const: read-only input */
+                                                    int incy,
+                                                    double* A, /* sole non-const pointer: the only buffer this signature allows writing */
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDger_v2_64(cublasHandle_t handle,
+                                                       int64_t m,
+                                                       int64_t n,
+                                                       const double* alpha,
+                                                       const double* x,
+                                                       int64_t incx,
+                                                       const double* y,
+                                                       int64_t incy,
+                                                       double* A,
+                                                       int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeru_v2(cublasHandle_t handle,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex* alpha,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* y,
+                                                     int incy,
+                                                     cuComplex* A,
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeru_v2_64(cublasHandle_t handle,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        const cuComplex* alpha,
+                                                        const cuComplex* x,
+                                                        int64_t incx,
+                                                        const cuComplex* y,
+                                                        int64_t incy,
+                                                        cuComplex* A,
+                                                        int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgerc_v2(cublasHandle_t handle,
+                                                     int m,
+                                                     int n,
+                                                     const cuComplex* alpha,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* y,
+                                                     int incy,
+                                                     cuComplex* A,
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgerc_v2_64(cublasHandle_t handle,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        const cuComplex* alpha,
+                                                        const cuComplex* x,
+                                                        int64_t incx,
+                                                        const cuComplex* y,
+                                                        int64_t incy,
+                                                        cuComplex* A,
+                                                        int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeru_v2(cublasHandle_t handle,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* y,
+                                                     int incy,
+                                                     cuDoubleComplex* A,
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeru_v2_64(cublasHandle_t handle,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* alpha,
+                                                        const cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* y,
+                                                        int64_t incy,
+                                                        cuDoubleComplex* A,
+                                                        int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgerc_v2(cublasHandle_t handle,
+                                                     int m,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* y,
+                                                     int incy,
+                                                     cuDoubleComplex* A,
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgerc_v2_64(cublasHandle_t handle,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* alpha,
+                                                        const cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* y,
+                                                        int64_t incy,
+                                                        cuDoubleComplex* A,
+                                                        int64_t lda);
+
+/* SYR/HER (symmetric / Hermitian update, per BLAS naming): S/D/C/Z syr and C/Z her; uplo is a cublasFillMode_t. */
+/* her takes a real alpha (float for C, double for Z) while csyr/zsyr take a complex alpha; _64 forms use int64_t for n, incx, lda. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr_v2(cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const float* alpha,
+                                                    const float* x,
+                                                    int incx,
+                                                    float* A,
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr_v2_64(cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       int64_t n,
+                                                       const float* alpha,
+                                                       const float* x,
+                                                       int64_t incx,
+                                                       float* A,
+                                                       int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr_v2(cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const double* alpha,
+                                                    const double* x,
+                                                    int incx,
+                                                    double* A,
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr_v2_64(cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       int64_t n,
+                                                       const double* alpha,
+                                                       const double* x,
+                                                       int64_t incx,
+                                                       double* A,
+                                                       int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr_v2(cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const cuComplex* alpha,
+                                                    const cuComplex* x,
+                                                    int incx,
+                                                    cuComplex* A,
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr_v2_64(cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       int64_t n,
+                                                       const cuComplex* alpha,
+                                                       const cuComplex* x,
+                                                       int64_t incx,
+                                                       cuComplex* A,
+                                                       int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr_v2(cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const cuDoubleComplex* alpha,
+                                                    const cuDoubleComplex* x,
+                                                    int incx,
+                                                    cuDoubleComplex* A,
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr_v2_64(cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       int64_t n,
+                                                       const cuDoubleComplex* alpha,
+                                                       const cuDoubleComplex* x,
+                                                       int64_t incx,
+                                                       cuDoubleComplex* A,
+                                                       int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher_v2(cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const float* alpha,
+                                                    const cuComplex* x,
+                                                    int incx,
+                                                    cuComplex* A,
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher_v2_64(cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       int64_t n,
+                                                       const float* alpha,
+                                                       const cuComplex* x,
+                                                       int64_t incx,
+                                                       cuComplex* A,
+                                                       int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher_v2(cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const double* alpha,
+                                                    const cuDoubleComplex* x,
+                                                    int incx,
+                                                    cuDoubleComplex* A,
+                                                    int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher_v2_64(cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       int64_t n,
+                                                       const double* alpha,
+                                                       const cuDoubleComplex* x,
+                                                       int64_t incx,
+                                                       cuDoubleComplex* A,
+                                                       int64_t lda);
+
+/* SPR/HPR (packed counterparts, per BLAS naming): S/D spr and C/Z hpr; the matrix is passed as AP with no leading dimension. */
+/* hpr takes a real alpha (float for C, double for Z); the _64 forms use int64_t for n and incx. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr_v2(
+    cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* alpha, const float* x, int incx, float* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr_v2_64(cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       int64_t n,
+                                                       const float* alpha,
+                                                       const float* x,
+                                                       int64_t incx,
+                                                       float* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr_v2(
+    cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* alpha, const double* x, int incx, double* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr_v2_64(cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       int64_t n,
+                                                       const double* alpha,
+                                                       const double* x,
+                                                       int64_t incx,
+                                                       double* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr_v2(cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const float* alpha,
+                                                    const cuComplex* x,
+                                                    int incx,
+                                                    cuComplex* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr_v2_64(cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       int64_t n,
+                                                       const float* alpha,
+                                                       const cuComplex* x,
+                                                       int64_t incx,
+                                                       cuComplex* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr_v2(cublasHandle_t handle,
+                                                    cublasFillMode_t uplo,
+                                                    int n,
+                                                    const double* alpha,
+                                                    const cuDoubleComplex* x,
+                                                    int incx,
+                                                    cuDoubleComplex* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr_v2_64(cublasHandle_t handle,
+                                                       cublasFillMode_t uplo,
+                                                       int64_t n,
+                                                       const double* alpha,
+                                                       const cuDoubleComplex* x,
+                                                       int64_t incx,
+                                                       cuDoubleComplex* AP);
+
+/* SYR2/HER2 (two-vector variants, per BLAS naming): S/D/C/Z syr2 and C/Z her2, each taking both x and y. */
+/* Here alpha always has the matrix element type, including the complex her2 routines; _64 forms use int64_t for n, incx, incy, lda. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float* alpha,
+                                                     const float* x,
+                                                     int incx,
+                                                     const float* y,
+                                                     int incy,
+                                                     float* A,
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const float* alpha,
+                                                        const float* x,
+                                                        int64_t incx,
+                                                        const float* y,
+                                                        int64_t incy,
+                                                        float* A,
+                                                        int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double* alpha,
+                                                     const double* x,
+                                                     int incx,
+                                                     const double* y,
+                                                     int incy,
+                                                     double* A,
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const double* alpha,
+                                                        const double* x,
+                                                        int64_t incx,
+                                                        const double* y,
+                                                        int64_t incy,
+                                                        double* A,
+                                                        int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex* alpha,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* y,
+                                                     int incy,
+                                                     cuComplex* A,
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuComplex* alpha,
+                                                        const cuComplex* x,
+                                                        int64_t incx,
+                                                        const cuComplex* y,
+                                                        int64_t incy,
+                                                        cuComplex* A,
+                                                        int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* y,
+                                                     int incy,
+                                                     cuDoubleComplex* A,
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* alpha,
+                                                        const cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* y,
+                                                        int64_t incy,
+                                                        cuDoubleComplex* A,
+                                                        int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex* alpha,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* y,
+                                                     int incy,
+                                                     cuComplex* A,
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuComplex* alpha,
+                                                        const cuComplex* x,
+                                                        int64_t incx,
+                                                        const cuComplex* y,
+                                                        int64_t incy,
+                                                        cuComplex* A,
+                                                        int64_t lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* y,
+                                                     int incy,
+                                                     cuDoubleComplex* A,
+                                                     int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* alpha,
+                                                        const cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* y,
+                                                        int64_t incy,
+                                                        cuDoubleComplex* A,
+                                                        int64_t lda);
+
+/* SPR2/HPR2 (packed two-vector variants, per BLAS naming): S/D spr2 and C/Z hpr2; the matrix is passed as AP with no leading dimension. */
+/* Here alpha always has the matrix element type, including the complex hpr2 routines; _64 forms use int64_t for n, incx, incy. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const float* alpha,
+                                                     const float* x,
+                                                     int incx,
+                                                     const float* y,
+                                                     int incy,
+                                                     float* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr2_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const float* alpha,
+                                                        const float* x,
+                                                        int64_t incx,
+                                                        const float* y,
+                                                        int64_t incy,
+                                                        float* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const double* alpha,
+                                                     const double* x,
+                                                     int incx,
+                                                     const double* y,
+                                                     int incy,
+                                                     double* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr2_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const double* alpha,
+                                                        const double* x,
+                                                        int64_t incx,
+                                                        const double* y,
+                                                        int64_t incy,
+                                                        double* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuComplex* alpha,
+                                                     const cuComplex* x,
+                                                     int incx,
+                                                     const cuComplex* y,
+                                                     int incy,
+                                                     cuComplex* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr2_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuComplex* alpha,
+                                                        const cuComplex* x,
+                                                        int64_t incx,
+                                                        const cuComplex* y,
+                                                        int64_t incy,
+                                                        cuComplex* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr2_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     int n,
+                                                     const cuDoubleComplex* alpha,
+                                                     const cuDoubleComplex* x,
+                                                     int incx,
+                                                     const cuDoubleComplex* y,
+                                                     int incy,
+                                                     cuDoubleComplex* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr2_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        int64_t n,
+                                                        const cuDoubleComplex* alpha,
+                                                        const cuDoubleComplex* x,
+                                                        int64_t incx,
+                                                        const cuDoubleComplex* y,
+                                                        int64_t incy,
+                                                        cuDoubleComplex* AP);
+
+/* BATCH GEMV */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         const float* alpha, /* scalar, passed by pointer */
+                                                         const float* const Aarray[], /* array of pointers to read-only float data */
+                                                         int lda,
+                                                         const float* const xarray[], /* array of pointers to read-only float data */
+                                                         int incx,
+                                                         const float* beta, /* scalar, passed by pointer */
+                                                         float* const yarray[], /* array of pointers to writable float data (only non-const operand) */
+                                                         int incy,
+                                                         int batchCount); /* pointer-array BATCH GEMV, float operands; sizes are 32-bit int (int64_t in cublasSgemvBatched_64) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvBatched_64(cublasHandle_t handle,
+                                                            cublasOperation_t trans,
+                                                            int64_t m,
+                                                            int64_t n,
+                                                            const float* alpha, /* scalar, passed by pointer */
+                                                            const float* const Aarray[], /* array of pointers to read-only float data */
+                                                            int64_t lda,
+                                                            const float* const xarray[], /* array of pointers to read-only float data */
+                                                            int64_t incx,
+                                                            const float* beta, /* scalar, passed by pointer */
+                                                            float* const yarray[], /* array of pointers to writable float data (only non-const operand) */
+                                                            int64_t incy,
+                                                            int64_t batchCount); /* same parameter list as cublasSgemvBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         const double* alpha, /* scalar, passed by pointer */
+                                                         const double* const Aarray[], /* array of pointers to read-only double data */
+                                                         int lda,
+                                                         const double* const xarray[], /* array of pointers to read-only double data */
+                                                         int incx,
+                                                         const double* beta, /* scalar, passed by pointer */
+                                                         double* const yarray[], /* array of pointers to writable double data (only non-const operand) */
+                                                         int incy,
+                                                         int batchCount); /* pointer-array BATCH GEMV, double operands; sizes are 32-bit int (int64_t in cublasDgemvBatched_64) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvBatched_64(cublasHandle_t handle,
+                                                            cublasOperation_t trans,
+                                                            int64_t m,
+                                                            int64_t n,
+                                                            const double* alpha, /* scalar, passed by pointer */
+                                                            const double* const Aarray[], /* array of pointers to read-only double data */
+                                                            int64_t lda,
+                                                            const double* const xarray[], /* array of pointers to read-only double data */
+                                                            int64_t incx,
+                                                            const double* beta, /* scalar, passed by pointer */
+                                                            double* const yarray[], /* array of pointers to writable double data (only non-const operand) */
+                                                            int64_t incy,
+                                                            int64_t batchCount); /* same parameter list as cublasDgemvBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         const cuComplex* alpha, /* scalar, passed by pointer */
+                                                         const cuComplex* const Aarray[], /* array of pointers to read-only cuComplex data */
+                                                         int lda,
+                                                         const cuComplex* const xarray[], /* array of pointers to read-only cuComplex data */
+                                                         int incx,
+                                                         const cuComplex* beta, /* scalar, passed by pointer */
+                                                         cuComplex* const yarray[], /* array of pointers to writable cuComplex data (only non-const operand) */
+                                                         int incy,
+                                                         int batchCount); /* pointer-array BATCH GEMV, cuComplex operands; sizes are 32-bit int (int64_t in cublasCgemvBatched_64) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvBatched_64(cublasHandle_t handle,
+                                                            cublasOperation_t trans,
+                                                            int64_t m,
+                                                            int64_t n,
+                                                            const cuComplex* alpha, /* scalar, passed by pointer */
+                                                            const cuComplex* const Aarray[], /* array of pointers to read-only cuComplex data */
+                                                            int64_t lda,
+                                                            const cuComplex* const xarray[], /* array of pointers to read-only cuComplex data */
+                                                            int64_t incx,
+                                                            const cuComplex* beta, /* scalar, passed by pointer */
+                                                            cuComplex* const yarray[], /* array of pointers to writable cuComplex data (only non-const operand) */
+                                                            int64_t incy,
+                                                            int64_t batchCount); /* same parameter list as cublasCgemvBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         const cuDoubleComplex* alpha, /* scalar, passed by pointer */
+                                                         const cuDoubleComplex* const Aarray[], /* array of pointers to read-only cuDoubleComplex data */
+                                                         int lda,
+                                                         const cuDoubleComplex* const xarray[], /* array of pointers to read-only cuDoubleComplex data */
+                                                         int incx,
+                                                         const cuDoubleComplex* beta, /* scalar, passed by pointer */
+                                                         cuDoubleComplex* const yarray[], /* array of pointers to writable cuDoubleComplex data (only non-const operand) */
+                                                         int incy,
+                                                         int batchCount); /* pointer-array BATCH GEMV, cuDoubleComplex operands; sizes are 32-bit int (int64_t in cublasZgemvBatched_64) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvBatched_64(cublasHandle_t handle,
+                                                            cublasOperation_t trans,
+                                                            int64_t m,
+                                                            int64_t n,
+                                                            const cuDoubleComplex* alpha, /* scalar, passed by pointer */
+                                                            const cuDoubleComplex* const Aarray[], /* array of pointers to read-only cuDoubleComplex data */
+                                                            int64_t lda,
+                                                            const cuDoubleComplex* const xarray[], /* array of pointers to read-only cuDoubleComplex data */
+                                                            int64_t incx,
+                                                            const cuDoubleComplex* beta, /* scalar, passed by pointer */
+                                                            cuDoubleComplex* const yarray[], /* array of pointers to writable cuDoubleComplex data (only non-const operand) */
+                                                            int64_t incy,
+                                                            int64_t batchCount); /* same parameter list as cublasZgemvBatched with every int widened to int64_t */
+
+#if defined(__cplusplus)
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvBatched(cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           const float* alpha, /* scalars are float although A/x/y are __half */
+                                                           const __half* const Aarray[], /* array of pointers to read-only __half data */
+                                                           int lda,
+                                                           const __half* const xarray[], /* array of pointers to read-only __half data */
+                                                           int incx,
+                                                           const float* beta,
+                                                           __half* const yarray[], /* array of pointers to writable __half data (only non-const operand) */
+                                                           int incy,
+                                                           int batchCount); /* C++-only: declared under #if defined(__cplusplus). Sizes are 32-bit int (int64_t in cublasHSHgemvBatched_64) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvBatched_64(cublasHandle_t handle,
+                                                              cublasOperation_t trans,
+                                                              int64_t m,
+                                                              int64_t n,
+                                                              const float* alpha, /* scalars are float although A/x/y are __half */
+                                                              const __half* const Aarray[], /* array of pointers to read-only __half data */
+                                                              int64_t lda,
+                                                              const __half* const xarray[], /* array of pointers to read-only __half data */
+                                                              int64_t incx,
+                                                              const float* beta,
+                                                              __half* const yarray[], /* array of pointers to writable __half data (only non-const operand) */
+                                                              int64_t incy,
+                                                              int64_t batchCount); /* C++-only (#if defined(__cplusplus)). Same parameter list as cublasHSHgemvBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvBatched(cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           const float* alpha, /* scalar, passed by pointer */
+                                                           const __half* const Aarray[], /* array of pointers to read-only __half data */
+                                                           int lda,
+                                                           const __half* const xarray[], /* array of pointers to read-only __half data */
+                                                           int incx,
+                                                           const float* beta, /* scalar, passed by pointer */
+                                                           float* const yarray[], /* array of pointers to writable float data: y is float while A/x are __half */
+                                                           int incy,
+                                                           int batchCount); /* C++-only: declared under #if defined(__cplusplus). Sizes are 32-bit int (int64_t in cublasHSSgemvBatched_64) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvBatched_64(cublasHandle_t handle,
+                                                              cublasOperation_t trans,
+                                                              int64_t m,
+                                                              int64_t n,
+                                                              const float* alpha, /* scalar, passed by pointer */
+                                                              const __half* const Aarray[], /* array of pointers to read-only __half data */
+                                                              int64_t lda,
+                                                              const __half* const xarray[], /* array of pointers to read-only __half data */
+                                                              int64_t incx,
+                                                              const float* beta, /* scalar, passed by pointer */
+                                                              float* const yarray[], /* array of pointers to writable float data: y is float while A/x are __half */
+                                                              int64_t incy,
+                                                              int64_t batchCount); /* C++-only (#if defined(__cplusplus)). Same parameter list as cublasHSSgemvBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvBatched(cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           const float* alpha, /* scalars are float although A/x/y are __nv_bfloat16 */
+                                                           const __nv_bfloat16* const Aarray[], /* array of pointers to read-only __nv_bfloat16 data */
+                                                           int lda,
+                                                           const __nv_bfloat16* const xarray[], /* array of pointers to read-only __nv_bfloat16 data */
+                                                           int incx,
+                                                           const float* beta,
+                                                           __nv_bfloat16* const yarray[], /* array of pointers to writable __nv_bfloat16 data (only non-const operand) */
+                                                           int incy,
+                                                           int batchCount); /* C++-only: declared under #if defined(__cplusplus). Sizes are 32-bit int (int64_t in cublasTSTgemvBatched_64) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvBatched_64(cublasHandle_t handle,
+                                                              cublasOperation_t trans,
+                                                              int64_t m,
+                                                              int64_t n,
+                                                              const float* alpha, /* scalars are float although A/x/y are __nv_bfloat16 */
+                                                              const __nv_bfloat16* const Aarray[], /* array of pointers to read-only __nv_bfloat16 data */
+                                                              int64_t lda,
+                                                              const __nv_bfloat16* const xarray[], /* array of pointers to read-only __nv_bfloat16 data */
+                                                              int64_t incx,
+                                                              const float* beta,
+                                                              __nv_bfloat16* const yarray[], /* array of pointers to writable __nv_bfloat16 data (only non-const operand) */
+                                                              int64_t incy,
+                                                              int64_t batchCount); /* C++-only (#if defined(__cplusplus)). Same parameter list as cublasTSTgemvBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvBatched(cublasHandle_t handle,
+                                                           cublasOperation_t trans,
+                                                           int m,
+                                                           int n,
+                                                           const float* alpha, /* scalar, passed by pointer */
+                                                           const __nv_bfloat16* const Aarray[], /* array of pointers to read-only __nv_bfloat16 data */
+                                                           int lda,
+                                                           const __nv_bfloat16* const xarray[], /* array of pointers to read-only __nv_bfloat16 data */
+                                                           int incx,
+                                                           const float* beta, /* scalar, passed by pointer */
+                                                           float* const yarray[], /* array of pointers to writable float data: y is float while A/x are __nv_bfloat16 */
+                                                           int incy,
+                                                           int batchCount); /* C++-only: declared under #if defined(__cplusplus). Sizes are 32-bit int (int64_t in cublasTSSgemvBatched_64) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvBatched_64(cublasHandle_t handle,
+                                                              cublasOperation_t trans,
+                                                              int64_t m,
+                                                              int64_t n,
+                                                              const float* alpha, /* scalar, passed by pointer */
+                                                              const __nv_bfloat16* const Aarray[], /* array of pointers to read-only __nv_bfloat16 data */
+                                                              int64_t lda,
+                                                              const __nv_bfloat16* const xarray[], /* array of pointers to read-only __nv_bfloat16 data */
+                                                              int64_t incx,
+                                                              const float* beta, /* scalar, passed by pointer */
+                                                              float* const yarray[], /* array of pointers to writable float data: y is float while A/x are __nv_bfloat16 */
+                                                              int64_t incy,
+                                                              int64_t batchCount); /* C++-only (#if defined(__cplusplus)). Same parameter list as cublasTSSgemvBatched with every int widened to int64_t */
+
+#endif
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t trans,
+                                                                int m,
+                                                                int n,
+                                                                const float* alpha, /* scalar, passed by pointer */
+                                                                const float* A, /* single base pointer (no pointer array), read-only */
+                                                                int lda,
+                                                                long long int strideA, /* strides are long long int in both the int and _64 variants */
+                                                                const float* x, /* single base pointer, read-only */
+                                                                int incx,
+                                                                long long int stridex,
+                                                                const float* beta, /* scalar, passed by pointer */
+                                                                float* y, /* single base pointer; only non-const operand */
+                                                                int incy,
+                                                                long long int stridey,
+                                                                int batchCount); /* float operands, 32-bit int sizes (int64_t in cublasSgemvStridedBatched_64). Presumably each stride is the spacing between consecutive batch entries - confirm in cuBLAS docs */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvStridedBatched_64(cublasHandle_t handle,
+                                                                   cublasOperation_t trans,
+                                                                   int64_t m,
+                                                                   int64_t n,
+                                                                   const float* alpha, /* scalar, passed by pointer */
+                                                                   const float* A, /* single base pointer (no pointer array), read-only */
+                                                                   int64_t lda,
+                                                                   long long int strideA, /* strides keep the long long int type of the non-_64 variant */
+                                                                   const float* x, /* single base pointer, read-only */
+                                                                   int64_t incx,
+                                                                   long long int stridex,
+                                                                   const float* beta, /* scalar, passed by pointer */
+                                                                   float* y, /* single base pointer; only non-const operand */
+                                                                   int64_t incy,
+                                                                   long long int stridey,
+                                                                   int64_t batchCount); /* same parameter list as cublasSgemvStridedBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t trans,
+                                                                int m,
+                                                                int n,
+                                                                const double* alpha, /* scalar, passed by pointer */
+                                                                const double* A, /* single base pointer (no pointer array), read-only */
+                                                                int lda,
+                                                                long long int strideA, /* strides are long long int in both the int and _64 variants */
+                                                                const double* x, /* single base pointer, read-only */
+                                                                int incx,
+                                                                long long int stridex,
+                                                                const double* beta, /* scalar, passed by pointer */
+                                                                double* y, /* single base pointer; only non-const operand */
+                                                                int incy,
+                                                                long long int stridey,
+                                                                int batchCount); /* double operands, 32-bit int sizes (int64_t in cublasDgemvStridedBatched_64). Presumably each stride is the spacing between consecutive batch entries - confirm in cuBLAS docs */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvStridedBatched_64(cublasHandle_t handle,
+                                                                   cublasOperation_t trans,
+                                                                   int64_t m,
+                                                                   int64_t n,
+                                                                   const double* alpha, /* scalar, passed by pointer */
+                                                                   const double* A, /* single base pointer (no pointer array), read-only */
+                                                                   int64_t lda,
+                                                                   long long int strideA, /* strides keep the long long int type of the non-_64 variant */
+                                                                   const double* x, /* single base pointer, read-only */
+                                                                   int64_t incx,
+                                                                   long long int stridex,
+                                                                   const double* beta, /* scalar, passed by pointer */
+                                                                   double* y, /* single base pointer; only non-const operand */
+                                                                   int64_t incy,
+                                                                   long long int stridey,
+                                                                   int64_t batchCount); /* same parameter list as cublasDgemvStridedBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t trans,
+                                                                int m,
+                                                                int n,
+                                                                const cuComplex* alpha, /* scalar, passed by pointer */
+                                                                const cuComplex* A, /* single base pointer (no pointer array), read-only */
+                                                                int lda,
+                                                                long long int strideA, /* strides are long long int in both the int and _64 variants */
+                                                                const cuComplex* x, /* single base pointer, read-only */
+                                                                int incx,
+                                                                long long int stridex,
+                                                                const cuComplex* beta, /* scalar, passed by pointer */
+                                                                cuComplex* y, /* single base pointer; only non-const operand */
+                                                                int incy,
+                                                                long long int stridey,
+                                                                int batchCount); /* cuComplex operands, 32-bit int sizes (int64_t in cublasCgemvStridedBatched_64). Presumably each stride is the spacing between consecutive batch entries - confirm in cuBLAS docs */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvStridedBatched_64(cublasHandle_t handle,
+                                                                   cublasOperation_t trans,
+                                                                   int64_t m,
+                                                                   int64_t n,
+                                                                   const cuComplex* alpha, /* scalar, passed by pointer */
+                                                                   const cuComplex* A, /* single base pointer (no pointer array), read-only */
+                                                                   int64_t lda,
+                                                                   long long int strideA, /* strides keep the long long int type of the non-_64 variant */
+                                                                   const cuComplex* x, /* single base pointer, read-only */
+                                                                   int64_t incx,
+                                                                   long long int stridex,
+                                                                   const cuComplex* beta, /* scalar, passed by pointer */
+                                                                   cuComplex* y, /* single base pointer; only non-const operand */
+                                                                   int64_t incy,
+                                                                   long long int stridey,
+                                                                   int64_t batchCount); /* same parameter list as cublasCgemvStridedBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvStridedBatched(cublasHandle_t handle,
+                                                                cublasOperation_t trans,
+                                                                int m,
+                                                                int n,
+                                                                const cuDoubleComplex* alpha, /* scalar, passed by pointer */
+                                                                const cuDoubleComplex* A, /* single base pointer (no pointer array), read-only */
+                                                                int lda,
+                                                                long long int strideA, /* strides are long long int in both the int and _64 variants */
+                                                                const cuDoubleComplex* x, /* single base pointer, read-only */
+                                                                int incx,
+                                                                long long int stridex,
+                                                                const cuDoubleComplex* beta, /* scalar, passed by pointer */
+                                                                cuDoubleComplex* y, /* single base pointer; only non-const operand */
+                                                                int incy,
+                                                                long long int stridey,
+                                                                int batchCount); /* cuDoubleComplex operands, 32-bit int sizes (int64_t in cublasZgemvStridedBatched_64). Presumably each stride is the spacing between consecutive batch entries - confirm in cuBLAS docs */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvStridedBatched_64(cublasHandle_t handle,
+                                                                   cublasOperation_t trans,
+                                                                   int64_t m,
+                                                                   int64_t n,
+                                                                   const cuDoubleComplex* alpha, /* scalar, passed by pointer */
+                                                                   const cuDoubleComplex* A, /* single base pointer (no pointer array), read-only */
+                                                                   int64_t lda,
+                                                                   long long int strideA, /* strides keep the long long int type of the non-_64 variant */
+                                                                   const cuDoubleComplex* x, /* single base pointer, read-only */
+                                                                   int64_t incx,
+                                                                   long long int stridex,
+                                                                   const cuDoubleComplex* beta, /* scalar, passed by pointer */
+                                                                   cuDoubleComplex* y, /* single base pointer; only non-const operand */
+                                                                   int64_t incy,
+                                                                   long long int stridey,
+                                                                   int64_t batchCount); /* same parameter list as cublasZgemvStridedBatched with every int widened to int64_t */
+
+#if defined(__cplusplus)
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvStridedBatched(cublasHandle_t handle,
+                                                                  cublasOperation_t trans,
+                                                                  int m,
+                                                                  int n,
+                                                                  const float* alpha, /* scalars are float although A/x/y are __half */
+                                                                  const __half* A, /* single base pointer (no pointer array), read-only */
+                                                                  int lda,
+                                                                  long long int strideA, /* strides are long long int in both the int and _64 variants */
+                                                                  const __half* x, /* single base pointer, read-only */
+                                                                  int incx,
+                                                                  long long int stridex,
+                                                                  const float* beta,
+                                                                  __half* y, /* single base pointer; only non-const operand */
+                                                                  int incy,
+                                                                  long long int stridey,
+                                                                  int batchCount); /* C++-only: declared under #if defined(__cplusplus). Sizes are 32-bit int (int64_t in cublasHSHgemvStridedBatched_64) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvStridedBatched_64(cublasHandle_t handle,
+                                                                     cublasOperation_t trans,
+                                                                     int64_t m,
+                                                                     int64_t n,
+                                                                     const float* alpha, /* scalars are float although A/x/y are __half */
+                                                                     const __half* A, /* single base pointer (no pointer array), read-only */
+                                                                     int64_t lda,
+                                                                     long long int strideA, /* strides keep the long long int type of the non-_64 variant */
+                                                                     const __half* x, /* single base pointer, read-only */
+                                                                     int64_t incx,
+                                                                     long long int stridex,
+                                                                     const float* beta,
+                                                                     __half* y, /* single base pointer; only non-const operand */
+                                                                     int64_t incy,
+                                                                     long long int stridey,
+                                                                     int64_t batchCount); /* C++-only (#if defined(__cplusplus)). Same parameter list as cublasHSHgemvStridedBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvStridedBatched(cublasHandle_t handle,
+                                                                  cublasOperation_t trans,
+                                                                  int m,
+                                                                  int n,
+                                                                  const float* alpha, /* scalar, passed by pointer */
+                                                                  const __half* A, /* single base pointer (no pointer array), read-only */
+                                                                  int lda,
+                                                                  long long int strideA, /* strides are long long int in both the int and _64 variants */
+                                                                  const __half* x, /* single base pointer, read-only */
+                                                                  int incx,
+                                                                  long long int stridex,
+                                                                  const float* beta, /* scalar, passed by pointer */
+                                                                  float* y, /* only non-const operand; float while A/x are __half */
+                                                                  int incy,
+                                                                  long long int stridey,
+                                                                  int batchCount); /* C++-only: declared under #if defined(__cplusplus). Sizes are 32-bit int (int64_t in cublasHSSgemvStridedBatched_64) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvStridedBatched_64(cublasHandle_t handle,
+                                                                     cublasOperation_t trans,
+                                                                     int64_t m,
+                                                                     int64_t n,
+                                                                     const float* alpha, /* scalar, passed by pointer */
+                                                                     const __half* A, /* single base pointer (no pointer array), read-only */
+                                                                     int64_t lda,
+                                                                     long long int strideA, /* strides keep the long long int type of the non-_64 variant */
+                                                                     const __half* x, /* single base pointer, read-only */
+                                                                     int64_t incx,
+                                                                     long long int stridex,
+                                                                     const float* beta, /* scalar, passed by pointer */
+                                                                     float* y, /* only non-const operand; float while A/x are __half */
+                                                                     int64_t incy,
+                                                                     long long int stridey,
+                                                                     int64_t batchCount); /* C++-only (#if defined(__cplusplus)). Same parameter list as cublasHSSgemvStridedBatched with every int widened to int64_t */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvStridedBatched(cublasHandle_t handle,
+                                                                  cublasOperation_t trans,
+                                                                  int m,
+                                                                  int n,
+                                                                  const float* alpha,
+                                                                  const __nv_bfloat16* A,
+                                                                  int lda,
+                                                                  long long int strideA,
+                                                                  const __nv_bfloat16* x,
+                                                                  int incx,
+                                                                  long long int stridex,
+                                                                  const float* beta,
+                                                                  __nv_bfloat16* y,
+                                                                  int incy,
+                                                                  long long int stridey,
+                                                                  int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvStridedBatched_64(cublasHandle_t handle,
+                                                                     cublasOperation_t trans,
+                                                                     int64_t m,
+                                                                     int64_t n,
+                                                                     const float* alpha,
+                                                                     const __nv_bfloat16* A,
+                                                                     int64_t lda,
+                                                                     long long int strideA,
+                                                                     const __nv_bfloat16* x,
+                                                                     int64_t incx,
+                                                                     long long int stridex,
+                                                                     const float* beta,
+                                                                     __nv_bfloat16* y,
+                                                                     int64_t incy,
+                                                                     long long int stridey,
+                                                                     int64_t batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvStridedBatched(cublasHandle_t handle,
+                                                                  cublasOperation_t trans,
+                                                                  int m,
+                                                                  int n,
+                                                                  const float* alpha,
+                                                                  const __nv_bfloat16* A,
+                                                                  int lda,
+                                                                  long long int strideA,
+                                                                  const __nv_bfloat16* x,
+                                                                  int incx,
+                                                                  long long int stridex,
+                                                                  const float* beta,
+                                                                  float* y,
+                                                                  int incy,
+                                                                  long long int stridey,
+                                                                  int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvStridedBatched_64(cublasHandle_t handle,
+                                                                     cublasOperation_t trans,
+                                                                     int64_t m,
+                                                                     int64_t n,
+                                                                     const float* alpha,
+                                                                     const __nv_bfloat16* A,
+                                                                     int64_t lda,
+                                                                     long long int strideA,
+                                                                     const __nv_bfloat16* x,
+                                                                     int64_t incx,
+                                                                     long long int stridex,
+                                                                     const float* beta,
+                                                                     float* y,
+                                                                     int64_t incy,
+                                                                     long long int stridey,
+                                                                     int64_t batchCount);
+
+#endif
+
+/* ---------------- CUBLAS BLAS3 Functions ---------------- */
+
+/* GEMM */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2(cublasHandle_t handle,
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int m,
+                                                     int n,
+                                                     int k,
+                                                     const float* alpha,
+                                                     const float* A,
+                                                     int lda,
+                                                     const float* B,
+                                                     int ldb,
+                                                     const float* beta,
+                                                     float* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t transa,
+                                                        cublasOperation_t transb,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const float* alpha,
+                                                        const float* A,
+                                                        int64_t lda,
+                                                        const float* B,
+                                                        int64_t ldb,
+                                                        const float* beta,
+                                                        float* C,
+                                                        int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemm_v2(cublasHandle_t handle,
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int m,
+                                                     int n,
+                                                     int k,
+                                                     const double* alpha,
+                                                     const double* A,
+                                                     int lda,
+                                                     const double* B,
+                                                     int ldb,
+                                                     const double* beta,
+                                                     double* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemm_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t transa,
+                                                        cublasOperation_t transb,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const double* alpha,
+                                                        const double* A,
+                                                        int64_t lda,
+                                                        const double* B,
+                                                        int64_t ldb,
+                                                        const double* beta,
+                                                        double* C,
+                                                        int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm_v2(cublasHandle_t handle,
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int m,
+                                                     int n,
+                                                     int k,
+                                                     const cuComplex* alpha,
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* B,
+                                                     int ldb,
+                                                     const cuComplex* beta,
+                                                     cuComplex* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t transa,
+                                                        cublasOperation_t transb,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const cuComplex* alpha,
+                                                        const cuComplex* A,
+                                                        int64_t lda,
+                                                        const cuComplex* B,
+                                                        int64_t ldb,
+                                                        const cuComplex* beta,
+                                                        cuComplex* C,
+                                                        int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3m(cublasHandle_t handle,
+                                                    cublasOperation_t transa,
+                                                    cublasOperation_t transb,
+                                                    int m,
+                                                    int n,
+                                                    int k,
+                                                    const cuComplex* alpha,
+                                                    const cuComplex* A,
+                                                    int lda,
+                                                    const cuComplex* B,
+                                                    int ldb,
+                                                    const cuComplex* beta,
+                                                    cuComplex* C,
+                                                    int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3m_64(cublasHandle_t handle,
+                                                       cublasOperation_t transa,
+                                                       cublasOperation_t transb,
+                                                       int64_t m,
+                                                       int64_t n,
+                                                       int64_t k,
+                                                       const cuComplex* alpha,
+                                                       const cuComplex* A,
+                                                       int64_t lda,
+                                                       const cuComplex* B,
+                                                       int64_t ldb,
+                                                       const cuComplex* beta,
+                                                       cuComplex* C,
+                                                       int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mEx(cublasHandle_t handle,
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb,
+                                                      int m,
+                                                      int n,
+                                                      int k,
+                                                      const cuComplex* alpha,
+                                                      const void* A,
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const void* B,
+                                                      cudaDataType Btype,
+                                                      int ldb,
+                                                      const cuComplex* beta,
+                                                      void* C,
+                                                      cudaDataType Ctype,
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mEx_64(cublasHandle_t handle,
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int64_t m,
+                                                         int64_t n,
+                                                         int64_t k,
+                                                         const cuComplex* alpha,
+                                                         const void* A,
+                                                         cudaDataType Atype,
+                                                         int64_t lda,
+                                                         const void* B,
+                                                         cudaDataType Btype,
+                                                         int64_t ldb,
+                                                         const cuComplex* beta,
+                                                         void* C,
+                                                         cudaDataType Ctype,
+                                                         int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm_v2(cublasHandle_t handle,
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int m,
+                                                     int n,
+                                                     int k,
+                                                     const cuDoubleComplex* alpha,
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* B,
+                                                     int ldb,
+                                                     const cuDoubleComplex* beta,
+                                                     cuDoubleComplex* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm_v2_64(cublasHandle_t handle,
+                                                        cublasOperation_t transa,
+                                                        cublasOperation_t transb,
+                                                        int64_t m,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const cuDoubleComplex* alpha,
+                                                        const cuDoubleComplex* A,
+                                                        int64_t lda,
+                                                        const cuDoubleComplex* B,
+                                                        int64_t ldb,
+                                                        const cuDoubleComplex* beta,
+                                                        cuDoubleComplex* C,
+                                                        int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm3m(cublasHandle_t handle,
+                                                    cublasOperation_t transa,
+                                                    cublasOperation_t transb,
+                                                    int m,
+                                                    int n,
+                                                    int k,
+                                                    const cuDoubleComplex* alpha,
+                                                    const cuDoubleComplex* A,
+                                                    int lda,
+                                                    const cuDoubleComplex* B,
+                                                    int ldb,
+                                                    const cuDoubleComplex* beta,
+                                                    cuDoubleComplex* C,
+                                                    int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm3m_64(cublasHandle_t handle,
+                                                       cublasOperation_t transa,
+                                                       cublasOperation_t transb,
+                                                       int64_t m,
+                                                       int64_t n,
+                                                       int64_t k,
+                                                       const cuDoubleComplex* alpha,
+                                                       const cuDoubleComplex* A,
+                                                       int64_t lda,
+                                                       const cuDoubleComplex* B,
+                                                       int64_t ldb,
+                                                       const cuDoubleComplex* beta,
+                                                       cuDoubleComplex* C,
+                                                       int64_t ldc);
+
+#if defined(__cplusplus)
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemm(cublasHandle_t handle,
+                                                  cublasOperation_t transa,
+                                                  cublasOperation_t transb,
+                                                  int m,
+                                                  int n,
+                                                  int k,
+                                                  const __half* alpha,
+                                                  const __half* A,
+                                                  int lda,
+                                                  const __half* B,
+                                                  int ldb,
+                                                  const __half* beta,
+                                                  __half* C,
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemm_64(cublasHandle_t handle,
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int64_t m,
+                                                     int64_t n,
+                                                     int64_t k,
+                                                     const __half* alpha,
+                                                     const __half* A,
+                                                     int64_t lda,
+                                                     const __half* B,
+                                                     int64_t ldb,
+                                                     const __half* beta,
+                                                     __half* C,
+                                                     int64_t ldc);
+
+#endif
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmEx(cublasHandle_t handle,
+                                                    cublasOperation_t transa,
+                                                    cublasOperation_t transb,
+                                                    int m,
+                                                    int n,
+                                                    int k,
+                                                    const float* alpha,
+                                                    const void* A,
+                                                    cudaDataType Atype,
+                                                    int lda,
+                                                    const void* B,
+                                                    cudaDataType Btype,
+                                                    int ldb,
+                                                    const float* beta,
+                                                    void* C,
+                                                    cudaDataType Ctype,
+                                                    int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmEx_64(cublasHandle_t handle,
+                                                       cublasOperation_t transa,
+                                                       cublasOperation_t transb,
+                                                       int64_t m,
+                                                       int64_t n,
+                                                       int64_t k,
+                                                       const float* alpha,
+                                                       const void* A,
+                                                       cudaDataType Atype,
+                                                       int64_t lda,
+                                                       const void* B,
+                                                       cudaDataType Btype,
+                                                       int64_t ldb,
+                                                       const float* beta,
+                                                       void* C,
+                                                       cudaDataType Ctype,
+                                                       int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmEx(cublasHandle_t handle,
+                                                   cublasOperation_t transa,
+                                                   cublasOperation_t transb,
+                                                   int m,
+                                                   int n,
+                                                   int k,
+                                                   const void* alpha,
+                                                   const void* A,
+                                                   cudaDataType Atype,
+                                                   int lda,
+                                                   const void* B,
+                                                   cudaDataType Btype,
+                                                   int ldb,
+                                                   const void* beta,
+                                                   void* C,
+                                                   cudaDataType Ctype,
+                                                   int ldc,
+                                                   cublasComputeType_t computeType,
+                                                   cublasGemmAlgo_t algo);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmEx_64(cublasHandle_t handle,
+                                                      cublasOperation_t transa,
+                                                      cublasOperation_t transb,
+                                                      int64_t m,
+                                                      int64_t n,
+                                                      int64_t k,
+                                                      const void* alpha,
+                                                      const void* A,
+                                                      cudaDataType Atype,
+                                                      int64_t lda,
+                                                      const void* B,
+                                                      cudaDataType Btype,
+                                                      int64_t ldb,
+                                                      const void* beta,
+                                                      void* C,
+                                                      cudaDataType Ctype,
+                                                      int64_t ldc,
+                                                      cublasComputeType_t computeType,
+                                                      cublasGemmAlgo_t algo);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmEx(cublasHandle_t handle,
+                                                    cublasOperation_t transa,
+                                                    cublasOperation_t transb,
+                                                    int m,
+                                                    int n,
+                                                    int k,
+                                                    const cuComplex* alpha,
+                                                    const void* A,
+                                                    cudaDataType Atype,
+                                                    int lda,
+                                                    const void* B,
+                                                    cudaDataType Btype,
+                                                    int ldb,
+                                                    const cuComplex* beta,
+                                                    void* C,
+                                                    cudaDataType Ctype,
+                                                    int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmEx_64(cublasHandle_t handle,
+                                                       cublasOperation_t transa,
+                                                       cublasOperation_t transb,
+                                                       int64_t m,
+                                                       int64_t n,
+                                                       int64_t k,
+                                                       const cuComplex* alpha,
+                                                       const void* A,
+                                                       cudaDataType Atype,
+                                                       int64_t lda,
+                                                       const void* B,
+                                                       cudaDataType Btype,
+                                                       int64_t ldb,
+                                                       const cuComplex* beta,
+                                                       void* C,
+                                                       cudaDataType Ctype,
+                                                       int64_t ldc);
+
+/* SYRK */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrk_v2(cublasHandle_t handle,
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n,
+                                                     int k,
+                                                     const float* alpha,
+                                                     const float* A,
+                                                     int lda,
+                                                     const float* beta,
+                                                     float* C,
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrk_v2_64(cublasHandle_t handle,
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        int64_t n,
+                                                        int64_t k,
+                                                        const float* alpha,
+                                                        const float* A,
+                                                        int64_t lda,
+                                                        const float* beta,
+                                                        float* C,
+                                                        int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrk_v2(cublasHandle_t handle, /* SYRK section; double operands */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n, /* n, k, lda, ldc are int here; cublasDsyrk_v2_64 takes int64_t */
+                                                     int k,
+                                                     const double* alpha, /* alpha and beta are passed by pointer, not by value */
+                                                     const double* A,
+                                                     int lda,
+                                                     const double* beta,
+                                                     double* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrk_v2_64(cublasHandle_t handle, /* int64_t twin of cublasDsyrk_v2 */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        int64_t n, /* n, k, lda, ldc are int64_t; every other parameter matches cublasDsyrk_v2 */
+                                                        int64_t k,
+                                                        const double* alpha,
+                                                        const double* A,
+                                                        int64_t lda,
+                                                        const double* beta,
+                                                        double* C, /* only pointer parameter not declared const */
+                                                        int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk_v2(cublasHandle_t handle, /* SYRK section; cuComplex operands */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n, /* n, k, lda, ldc are int here; cublasCsyrk_v2_64 takes int64_t */
+                                                     int k,
+                                                     const cuComplex* alpha, /* alpha and beta share the cuComplex type of A and C */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const cuComplex* beta,
+                                                     cuComplex* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk_v2_64(cublasHandle_t handle, /* int64_t twin of cublasCsyrk_v2 */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        int64_t n, /* n, k, lda, ldc are int64_t; every other parameter matches cublasCsyrk_v2 */
+                                                        int64_t k,
+                                                        const cuComplex* alpha,
+                                                        const cuComplex* A,
+                                                        int64_t lda,
+                                                        const cuComplex* beta,
+                                                        cuComplex* C, /* only pointer parameter not declared const */
+                                                        int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrk_v2(cublasHandle_t handle, /* SYRK section; cuDoubleComplex operands */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n, /* n, k, lda, ldc are int here; cublasZsyrk_v2_64 takes int64_t */
+                                                     int k,
+                                                     const cuDoubleComplex* alpha, /* alpha and beta share the cuDoubleComplex type of A and C */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const cuDoubleComplex* beta,
+                                                     cuDoubleComplex* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrk_v2_64(cublasHandle_t handle, /* int64_t twin of cublasZsyrk_v2 */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        int64_t n, /* n, k, lda, ldc are int64_t; every other parameter matches cublasZsyrk_v2 */
+                                                        int64_t k,
+                                                        const cuDoubleComplex* alpha,
+                                                        const cuDoubleComplex* A,
+                                                        int64_t lda,
+                                                        const cuDoubleComplex* beta,
+                                                        cuDoubleComplex* C, /* only pointer parameter not declared const */
+                                                        int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkEx(cublasHandle_t handle, /* SYRK section; A and C are void pointers with cudaDataType tags */
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const cuComplex* alpha, /* alpha and beta stay typed as cuComplex */
+                                                    const void* A, /* untyped; presumably interpreted according to Atype (confirm in cuBLAS docs) */
+                                                    cudaDataType Atype,
+                                                    int lda,
+                                                    const cuComplex* beta,
+                                                    void* C, /* untyped and non-const; presumably interpreted according to Ctype */
+                                                    cudaDataType Ctype,
+                                                    int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkEx_64(cublasHandle_t handle, /* int64_t twin of cublasCsyrkEx */
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int64_t n, /* n, k, lda, ldc are int64_t; every other parameter matches cublasCsyrkEx */
+                                                       int64_t k,
+                                                       const cuComplex* alpha,
+                                                       const void* A,
+                                                       cudaDataType Atype,
+                                                       int64_t lda,
+                                                       const cuComplex* beta,
+                                                       void* C, /* only pointer parameter not declared const */
+                                                       cudaDataType Ctype,
+                                                       int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle, /* parameter list is type-for-type identical to cublasCsyrkEx */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n, /* NOTE(review): what "3m" changes is not visible in this declaration; check cuBLAS docs */
+                                                      int k,
+                                                      const cuComplex* alpha,
+                                                      const void* A, /* untyped; presumably interpreted according to Atype */
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const cuComplex* beta,
+                                                      void* C, /* untyped and non-const; presumably interpreted according to Ctype */
+                                                      cudaDataType Ctype,
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx_64(cublasHandle_t handle, /* int64_t twin of cublasCsyrk3mEx */
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         int64_t n, /* n, k, lda, ldc are int64_t; every other parameter matches cublasCsyrk3mEx */
+                                                         int64_t k,
+                                                         const cuComplex* alpha,
+                                                         const void* A,
+                                                         cudaDataType Atype,
+                                                         int64_t lda,
+                                                         const cuComplex* beta,
+                                                         void* C, /* only pointer parameter not declared const */
+                                                         cudaDataType Ctype,
+                                                         int64_t ldc);
+
+/* HERK */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk_v2(cublasHandle_t handle, /* HERK section; cuComplex matrices */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n, /* n, k, lda, ldc are int here; cublasCherk_v2_64 takes int64_t */
+                                                     int k,
+                                                     const float* alpha, /* alpha and beta are float even though A and C are cuComplex */
+                                                     const cuComplex* A,
+                                                     int lda,
+                                                     const float* beta,
+                                                     cuComplex* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk_v2_64(cublasHandle_t handle, /* int64_t twin of cublasCherk_v2 */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        int64_t n, /* n, k, lda, ldc are int64_t; every other parameter matches cublasCherk_v2 */
+                                                        int64_t k,
+                                                        const float* alpha, /* float, not cuComplex, as in the int variant */
+                                                        const cuComplex* A,
+                                                        int64_t lda,
+                                                        const float* beta,
+                                                        cuComplex* C,
+                                                        int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherk_v2(cublasHandle_t handle, /* HERK section; cuDoubleComplex matrices */
+                                                     cublasFillMode_t uplo,
+                                                     cublasOperation_t trans,
+                                                     int n, /* n, k, lda, ldc are int here; cublasZherk_v2_64 takes int64_t */
+                                                     int k,
+                                                     const double* alpha, /* alpha and beta are double even though A and C are cuDoubleComplex */
+                                                     const cuDoubleComplex* A,
+                                                     int lda,
+                                                     const double* beta,
+                                                     cuDoubleComplex* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                     int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherk_v2_64(cublasHandle_t handle, /* int64_t twin of cublasZherk_v2 */
+                                                        cublasFillMode_t uplo,
+                                                        cublasOperation_t trans,
+                                                        int64_t n, /* n, k, lda, ldc are int64_t; every other parameter matches cublasZherk_v2 */
+                                                        int64_t k,
+                                                        const double* alpha, /* double, not cuDoubleComplex, as in the int variant */
+                                                        const cuDoubleComplex* A,
+                                                        int64_t lda,
+                                                        const double* beta,
+                                                        cuDoubleComplex* C,
+                                                        int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkEx(cublasHandle_t handle, /* HERK section; A and C are void pointers with cudaDataType tags */
+                                                    cublasFillMode_t uplo,
+                                                    cublasOperation_t trans,
+                                                    int n,
+                                                    int k,
+                                                    const float* alpha, /* alpha and beta are float, unlike cublasCsyrkEx which takes cuComplex */
+                                                    const void* A, /* untyped; presumably interpreted according to Atype (confirm in cuBLAS docs) */
+                                                    cudaDataType Atype,
+                                                    int lda,
+                                                    const float* beta,
+                                                    void* C, /* untyped and non-const; presumably interpreted according to Ctype */
+                                                    cudaDataType Ctype,
+                                                    int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkEx_64(cublasHandle_t handle, /* int64_t twin of cublasCherkEx */
+                                                       cublasFillMode_t uplo,
+                                                       cublasOperation_t trans,
+                                                       int64_t n, /* n, k, lda, ldc are int64_t; every other parameter matches cublasCherkEx */
+                                                       int64_t k,
+                                                       const float* alpha,
+                                                       const void* A,
+                                                       cudaDataType Atype,
+                                                       int64_t lda,
+                                                       const float* beta,
+                                                       void* C, /* only pointer parameter not declared const */
+                                                       cudaDataType Ctype,
+                                                       int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk3mEx(cublasHandle_t handle, /* parameter list is type-for-type identical to cublasCherkEx */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n, /* NOTE(review): what "3m" changes is not visible in this declaration; check cuBLAS docs */
+                                                      int k,
+                                                      const float* alpha, /* float alpha and beta, as in cublasCherkEx */
+                                                      const void* A, /* untyped; presumably interpreted according to Atype */
+                                                      cudaDataType Atype,
+                                                      int lda,
+                                                      const float* beta,
+                                                      void* C, /* untyped and non-const; presumably interpreted according to Ctype */
+                                                      cudaDataType Ctype,
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk3mEx_64(cublasHandle_t handle, /* int64_t twin of cublasCherk3mEx */
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         int64_t n, /* n, k, lda, ldc are int64_t; every other parameter matches cublasCherk3mEx */
+                                                         int64_t k,
+                                                         const float* alpha,
+                                                         const void* A,
+                                                         cudaDataType Atype,
+                                                         int64_t lda,
+                                                         const float* beta,
+                                                         void* C, /* only pointer parameter not declared const */
+                                                         cudaDataType Ctype,
+                                                         int64_t ldc);
+
+/* SYR2K / HER2K */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2(cublasHandle_t handle, /* SYR2K / HER2K section; float operands */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n, /* n, k, lda, ldb, ldc are int here; cublasSsyr2k_v2_64 takes int64_t */
+                                                      int k,
+                                                      const float* alpha,
+                                                      const float* A,
+                                                      int lda,
+                                                      const float* B, /* second const data pointer with its own ldb; cublasSsyrk_v2 has no B/ldb pair */
+                                                      int ldb,
+                                                      const float* beta,
+                                                      float* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2_64(cublasHandle_t handle, /* int64_t twin of cublasSsyr2k_v2 */
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         int64_t n, /* n, k, lda, ldb, ldc are int64_t; every other parameter matches cublasSsyr2k_v2 */
+                                                         int64_t k,
+                                                         const float* alpha,
+                                                         const float* A,
+                                                         int64_t lda,
+                                                         const float* B,
+                                                         int64_t ldb,
+                                                         const float* beta,
+                                                         float* C, /* only pointer parameter not declared const */
+                                                         int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2(cublasHandle_t handle, /* SYR2K / HER2K section; double operands */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n, /* n, k, lda, ldb, ldc are int here; cublasDsyr2k_v2_64 takes int64_t */
+                                                      int k,
+                                                      const double* alpha,
+                                                      const double* A,
+                                                      int lda,
+                                                      const double* B, /* second const data pointer with its own ldb; cublasDsyrk_v2 has no B/ldb pair */
+                                                      int ldb,
+                                                      const double* beta,
+                                                      double* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2_64(cublasHandle_t handle, /* int64_t twin of cublasDsyr2k_v2 */
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         int64_t n, /* n, k, lda, ldb, ldc are int64_t; every other parameter matches cublasDsyr2k_v2 */
+                                                         int64_t k,
+                                                         const double* alpha,
+                                                         const double* A,
+                                                         int64_t lda,
+                                                         const double* B,
+                                                         int64_t ldb,
+                                                         const double* beta,
+                                                         double* C, /* only pointer parameter not declared const */
+                                                         int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2(cublasHandle_t handle, /* SYR2K / HER2K section; cuComplex operands */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n, /* n, k, lda, ldb, ldc are int here; cublasCsyr2k_v2_64 takes int64_t */
+                                                      int k,
+                                                      const cuComplex* alpha, /* alpha and beta are both cuComplex (contrast cublasCher2k_v2, whose beta is float) */
+                                                      const cuComplex* A,
+                                                      int lda,
+                                                      const cuComplex* B,
+                                                      int ldb,
+                                                      const cuComplex* beta,
+                                                      cuComplex* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2_64(cublasHandle_t handle, /* int64_t twin of cublasCsyr2k_v2 */
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         int64_t n, /* n, k, lda, ldb, ldc are int64_t; every other parameter matches cublasCsyr2k_v2 */
+                                                         int64_t k,
+                                                         const cuComplex* alpha,
+                                                         const cuComplex* A,
+                                                         int64_t lda,
+                                                         const cuComplex* B,
+                                                         int64_t ldb,
+                                                         const cuComplex* beta,
+                                                         cuComplex* C, /* only pointer parameter not declared const */
+                                                         int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2(cublasHandle_t handle, /* SYR2K / HER2K section; cuDoubleComplex operands */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n, /* n, k, lda, ldb, ldc are int here; cublasZsyr2k_v2_64 takes int64_t */
+                                                      int k,
+                                                      const cuDoubleComplex* alpha, /* alpha and beta are both cuDoubleComplex (contrast cublasZher2k_v2, whose beta is double) */
+                                                      const cuDoubleComplex* A,
+                                                      int lda,
+                                                      const cuDoubleComplex* B,
+                                                      int ldb,
+                                                      const cuDoubleComplex* beta,
+                                                      cuDoubleComplex* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2_64(cublasHandle_t handle, /* int64_t twin of cublasZsyr2k_v2 */
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         int64_t n, /* n, k, lda, ldb, ldc are int64_t; every other parameter matches cublasZsyr2k_v2 */
+                                                         int64_t k,
+                                                         const cuDoubleComplex* alpha,
+                                                         const cuDoubleComplex* A,
+                                                         int64_t lda,
+                                                         const cuDoubleComplex* B,
+                                                         int64_t ldb,
+                                                         const cuDoubleComplex* beta,
+                                                         cuDoubleComplex* C, /* only pointer parameter not declared const */
+                                                         int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2k_v2(cublasHandle_t handle, /* SYR2K / HER2K section; cuComplex matrices */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n, /* n, k, lda, ldb, ldc are int here; cublasCher2k_v2_64 takes int64_t */
+                                                      int k,
+                                                      const cuComplex* alpha, /* alpha is cuComplex ... */
+                                                      const cuComplex* A,
+                                                      int lda,
+                                                      const cuComplex* B,
+                                                      int ldb,
+                                                      const float* beta, /* ... while beta is float; the two scalar types differ in this routine */
+                                                      cuComplex* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2k_v2_64(cublasHandle_t handle, /* int64_t twin of cublasCher2k_v2 */
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         int64_t n, /* n, k, lda, ldb, ldc are int64_t; every other parameter matches cublasCher2k_v2 */
+                                                         int64_t k,
+                                                         const cuComplex* alpha,
+                                                         const cuComplex* A,
+                                                         int64_t lda,
+                                                         const cuComplex* B,
+                                                         int64_t ldb,
+                                                         const float* beta, /* float, unlike the cuComplex alpha */
+                                                         cuComplex* C,
+                                                         int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2k_v2(cublasHandle_t handle, /* SYR2K / HER2K section; cuDoubleComplex matrices */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int n, /* n, k, lda, ldb, ldc are int here; cublasZher2k_v2_64 takes int64_t */
+                                                      int k,
+                                                      const cuDoubleComplex* alpha, /* alpha is cuDoubleComplex ... */
+                                                      const cuDoubleComplex* A,
+                                                      int lda,
+                                                      const cuDoubleComplex* B,
+                                                      int ldb,
+                                                      const double* beta, /* ... while beta is double; the two scalar types differ in this routine */
+                                                      cuDoubleComplex* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                      int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2k_v2_64(cublasHandle_t handle, /* int64_t twin of cublasZher2k_v2 */
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         int64_t n, /* n, k, lda, ldb, ldc are int64_t; every other parameter matches cublasZher2k_v2 */
+                                                         int64_t k,
+                                                         const cuDoubleComplex* alpha,
+                                                         const cuDoubleComplex* A,
+                                                         int64_t lda,
+                                                         const cuDoubleComplex* B,
+                                                         int64_t ldb,
+                                                         const double* beta, /* double, unlike the cuDoubleComplex alpha */
+                                                         cuDoubleComplex* C,
+                                                         int64_t ldc);
+
+/* SYRKX / HERKX */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrkx(cublasHandle_t handle, /* SYRKX / HERKX section; float operands; name carries no _v2 suffix */
+                                                   cublasFillMode_t uplo,
+                                                   cublasOperation_t trans,
+                                                   int n, /* parameter list matches cublasSsyr2k_v2 type-for-type */
+                                                   int k,
+                                                   const float* alpha,
+                                                   const float* A,
+                                                   int lda,
+                                                   const float* B,
+                                                   int ldb,
+                                                   const float* beta,
+                                                   float* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                   int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrkx_64(cublasHandle_t handle, /* int64_t twin of cublasSsyrkx */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int64_t n, /* n, k, lda, ldb, ldc are int64_t; every other parameter matches cublasSsyrkx */
+                                                      int64_t k,
+                                                      const float* alpha,
+                                                      const float* A,
+                                                      int64_t lda,
+                                                      const float* B,
+                                                      int64_t ldb,
+                                                      const float* beta,
+                                                      float* C, /* only pointer parameter not declared const */
+                                                      int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrkx(cublasHandle_t handle, /* SYRKX / HERKX section; double operands; name carries no _v2 suffix */
+                                                   cublasFillMode_t uplo,
+                                                   cublasOperation_t trans,
+                                                   int n, /* parameter list matches cublasDsyr2k_v2 type-for-type */
+                                                   int k,
+                                                   const double* alpha,
+                                                   const double* A,
+                                                   int lda,
+                                                   const double* B,
+                                                   int ldb,
+                                                   const double* beta,
+                                                   double* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                   int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrkx_64(cublasHandle_t handle, /* int64_t twin of cublasDsyrkx */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int64_t n, /* n, k, lda, ldb, ldc are int64_t; every other parameter matches cublasDsyrkx */
+                                                      int64_t k,
+                                                      const double* alpha,
+                                                      const double* A,
+                                                      int64_t lda,
+                                                      const double* B,
+                                                      int64_t ldb,
+                                                      const double* beta,
+                                                      double* C, /* only pointer parameter not declared const */
+                                                      int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkx(cublasHandle_t handle, /* SYRKX / HERKX section; cuComplex operands; name carries no _v2 suffix */
+                                                   cublasFillMode_t uplo,
+                                                   cublasOperation_t trans,
+                                                   int n, /* parameter list matches cublasCsyr2k_v2 type-for-type */
+                                                   int k,
+                                                   const cuComplex* alpha,
+                                                   const cuComplex* A,
+                                                   int lda,
+                                                   const cuComplex* B,
+                                                   int ldb,
+                                                   const cuComplex* beta,
+                                                   cuComplex* C, /* only non-const data pointer; presumably the output (confirm in cuBLAS docs) */
+                                                   int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkx_64(cublasHandle_t handle, /* int64_t twin of cublasCsyrkx */
+                                                      cublasFillMode_t uplo,
+                                                      cublasOperation_t trans,
+                                                      int64_t n, /* n, k, lda, ldb, ldc are int64_t; every other parameter matches cublasCsyrkx */
+                                                      int64_t k,
+                                                      const cuComplex* alpha,
+                                                      const cuComplex* A,
+                                                      int64_t lda,
+                                                      const cuComplex* B,
+                                                      int64_t ldb,
+                                                      const cuComplex* beta,
+                                                      cuComplex* C, /* only pointer parameter not declared const */
+                                                      int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrkx(cublasHandle_t handle,  /* SYRKX for cuDoubleComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                   cublasFillMode_t uplo,  /* which triangle of C is stored */
+                                                   cublasOperation_t trans,  /* op() applied to A and B */
+                                                   int n,  /* rows of op(A) and op(B); order of C */
+                                                   int k,  /* columns of op(A) and op(B) */
+                                                   const cuDoubleComplex* alpha,  /* scalar on the op(A)*op(B)^T term */
+                                                   const cuDoubleComplex* A,  /* input matrix A (const) */
+                                                   int lda,  /* leading dimension of A */
+                                                   const cuDoubleComplex* B,  /* input matrix B (const) */
+                                                   int ldb,  /* leading dimension of B */
+                                                   const cuDoubleComplex* beta,  /* scalar on the existing C term */
+                                                   cuDoubleComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                   int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrkx_64(cublasHandle_t handle,  /* int64_t variant of cublasZsyrkx: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                      cublasFillMode_t uplo,  /* which triangle of C is stored */
+                                                      cublasOperation_t trans,  /* op() applied to A and B */
+                                                      int64_t n,  /* rows of op(A) and op(B); order of C */
+                                                      int64_t k,  /* columns of op(A) and op(B) */
+                                                      const cuDoubleComplex* alpha,  /* scalar on the op(A)*op(B)^T term */
+                                                      const cuDoubleComplex* A,  /* input matrix A (const) */
+                                                      int64_t lda,  /* leading dimension of A */
+                                                      const cuDoubleComplex* B,  /* input matrix B (const) */
+                                                      int64_t ldb,  /* leading dimension of B */
+                                                      const cuDoubleComplex* beta,  /* scalar on the existing C term */
+                                                      cuDoubleComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                      int64_t ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkx(cublasHandle_t handle,  /* HERKX for cuComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                   cublasFillMode_t uplo,  /* which triangle of C is stored */
+                                                   cublasOperation_t trans,  /* op() applied to A and B */
+                                                   int n,  /* rows of op(A) and op(B); order of C */
+                                                   int k,  /* columns of op(A) and op(B) */
+                                                   const cuComplex* alpha,  /* complex scalar on the op(A)*op(B)^H term */
+                                                   const cuComplex* A,  /* input matrix A (const) */
+                                                   int lda,  /* leading dimension of A */
+                                                   const cuComplex* B,  /* input matrix B (const) */
+                                                   int ldb,  /* leading dimension of B */
+                                                   const float* beta,  /* scalar on the existing C term; real-typed (float), unlike alpha */
+                                                   cuComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                   int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkx_64(cublasHandle_t handle,  /* int64_t variant of cublasCherkx: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                      cublasFillMode_t uplo,  /* which triangle of C is stored */
+                                                      cublasOperation_t trans,  /* op() applied to A and B */
+                                                      int64_t n,  /* rows of op(A) and op(B); order of C */
+                                                      int64_t k,  /* columns of op(A) and op(B) */
+                                                      const cuComplex* alpha,  /* complex scalar on the op(A)*op(B)^H term */
+                                                      const cuComplex* A,  /* input matrix A (const) */
+                                                      int64_t lda,  /* leading dimension of A */
+                                                      const cuComplex* B,  /* input matrix B (const) */
+                                                      int64_t ldb,  /* leading dimension of B */
+                                                      const float* beta,  /* scalar on the existing C term; real-typed (float), unlike alpha */
+                                                      cuComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                      int64_t ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherkx(cublasHandle_t handle,  /* HERKX for cuDoubleComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                   cublasFillMode_t uplo,  /* which triangle of C is stored */
+                                                   cublasOperation_t trans,  /* op() applied to A and B */
+                                                   int n,  /* rows of op(A) and op(B); order of C */
+                                                   int k,  /* columns of op(A) and op(B) */
+                                                   const cuDoubleComplex* alpha,  /* complex scalar on the op(A)*op(B)^H term */
+                                                   const cuDoubleComplex* A,  /* input matrix A (const) */
+                                                   int lda,  /* leading dimension of A */
+                                                   const cuDoubleComplex* B,  /* input matrix B (const) */
+                                                   int ldb,  /* leading dimension of B */
+                                                   const double* beta,  /* scalar on the existing C term; real-typed (double), unlike alpha */
+                                                   cuDoubleComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                   int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherkx_64(cublasHandle_t handle,  /* int64_t variant of cublasZherkx: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                      cublasFillMode_t uplo,  /* which triangle of C is stored */
+                                                      cublasOperation_t trans,  /* op() applied to A and B */
+                                                      int64_t n,  /* rows of op(A) and op(B); order of C */
+                                                      int64_t k,  /* columns of op(A) and op(B) */
+                                                      const cuDoubleComplex* alpha,  /* complex scalar on the op(A)*op(B)^H term */
+                                                      const cuDoubleComplex* A,  /* input matrix A (const) */
+                                                      int64_t lda,  /* leading dimension of A */
+                                                      const cuDoubleComplex* B,  /* input matrix B (const) */
+                                                      int64_t ldb,  /* leading dimension of B */
+                                                      const double* beta,  /* scalar on the existing C term; real-typed (double), unlike alpha */
+                                                      cuDoubleComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                      int64_t ldc);  /* leading dimension of C */
+
+/* SYMM */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymm_v2(cublasHandle_t handle,  /* SYMM for float data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     int m,  /* rows of B and C */
+                                                     int n,  /* columns of B and C */
+                                                     const float* alpha,  /* scalar on the A/B product */
+                                                     const float* A,  /* symmetric input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     const float* B,  /* input matrix B (const) */
+                                                     int ldb,  /* leading dimension of B */
+                                                     const float* beta,  /* scalar on the existing C term */
+                                                     float* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                     int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasSsymm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        int64_t m,  /* rows of B and C */
+                                                        int64_t n,  /* columns of B and C */
+                                                        const float* alpha,  /* scalar on the A/B product */
+                                                        const float* A,  /* symmetric input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        const float* B,  /* input matrix B (const) */
+                                                        int64_t ldb,  /* leading dimension of B */
+                                                        const float* beta,  /* scalar on the existing C term */
+                                                        float* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                        int64_t ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymm_v2(cublasHandle_t handle,  /* SYMM for double data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     int m,  /* rows of B and C */
+                                                     int n,  /* columns of B and C */
+                                                     const double* alpha,  /* scalar on the A/B product */
+                                                     const double* A,  /* symmetric input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     const double* B,  /* input matrix B (const) */
+                                                     int ldb,  /* leading dimension of B */
+                                                     const double* beta,  /* scalar on the existing C term */
+                                                     double* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                     int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasDsymm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        int64_t m,  /* rows of B and C */
+                                                        int64_t n,  /* columns of B and C */
+                                                        const double* alpha,  /* scalar on the A/B product */
+                                                        const double* A,  /* symmetric input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        const double* B,  /* input matrix B (const) */
+                                                        int64_t ldb,  /* leading dimension of B */
+                                                        const double* beta,  /* scalar on the existing C term */
+                                                        double* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                        int64_t ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymm_v2(cublasHandle_t handle,  /* SYMM for cuComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     int m,  /* rows of B and C */
+                                                     int n,  /* columns of B and C */
+                                                     const cuComplex* alpha,  /* scalar on the A/B product */
+                                                     const cuComplex* A,  /* symmetric input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     const cuComplex* B,  /* input matrix B (const) */
+                                                     int ldb,  /* leading dimension of B */
+                                                     const cuComplex* beta,  /* scalar on the existing C term */
+                                                     cuComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                     int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasCsymm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        int64_t m,  /* rows of B and C */
+                                                        int64_t n,  /* columns of B and C */
+                                                        const cuComplex* alpha,  /* scalar on the A/B product */
+                                                        const cuComplex* A,  /* symmetric input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        const cuComplex* B,  /* input matrix B (const) */
+                                                        int64_t ldb,  /* leading dimension of B */
+                                                        const cuComplex* beta,  /* scalar on the existing C term */
+                                                        cuComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                        int64_t ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymm_v2(cublasHandle_t handle,  /* SYMM for cuDoubleComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     int m,  /* rows of B and C */
+                                                     int n,  /* columns of B and C */
+                                                     const cuDoubleComplex* alpha,  /* scalar on the A/B product */
+                                                     const cuDoubleComplex* A,  /* symmetric input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     const cuDoubleComplex* B,  /* input matrix B (const) */
+                                                     int ldb,  /* leading dimension of B */
+                                                     const cuDoubleComplex* beta,  /* scalar on the existing C term */
+                                                     cuDoubleComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                     int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasZsymm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        int64_t m,  /* rows of B and C */
+                                                        int64_t n,  /* columns of B and C */
+                                                        const cuDoubleComplex* alpha,  /* scalar on the A/B product */
+                                                        const cuDoubleComplex* A,  /* symmetric input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        const cuDoubleComplex* B,  /* input matrix B (const) */
+                                                        int64_t ldb,  /* leading dimension of B */
+                                                        const cuDoubleComplex* beta,  /* scalar on the existing C term */
+                                                        cuDoubleComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                        int64_t ldc);  /* leading dimension of C */
+
+/* HEMM */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemm_v2(cublasHandle_t handle,  /* HEMM for cuComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     int m,  /* rows of B and C */
+                                                     int n,  /* columns of B and C */
+                                                     const cuComplex* alpha,  /* scalar on the A/B product */
+                                                     const cuComplex* A,  /* Hermitian input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     const cuComplex* B,  /* input matrix B (const) */
+                                                     int ldb,  /* leading dimension of B */
+                                                     const cuComplex* beta,  /* scalar on the existing C term */
+                                                     cuComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                     int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasChemm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        int64_t m,  /* rows of B and C */
+                                                        int64_t n,  /* columns of B and C */
+                                                        const cuComplex* alpha,  /* scalar on the A/B product */
+                                                        const cuComplex* A,  /* Hermitian input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        const cuComplex* B,  /* input matrix B (const) */
+                                                        int64_t ldb,  /* leading dimension of B */
+                                                        const cuComplex* beta,  /* scalar on the existing C term */
+                                                        cuComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                        int64_t ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemm_v2(cublasHandle_t handle,  /* HEMM for cuDoubleComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     int m,  /* rows of B and C */
+                                                     int n,  /* columns of B and C */
+                                                     const cuDoubleComplex* alpha,  /* scalar on the A/B product */
+                                                     const cuDoubleComplex* A,  /* Hermitian input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     const cuDoubleComplex* B,  /* input matrix B (const) */
+                                                     int ldb,  /* leading dimension of B */
+                                                     const cuDoubleComplex* beta,  /* scalar on the existing C term */
+                                                     cuDoubleComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                     int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasZhemm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        int64_t m,  /* rows of B and C */
+                                                        int64_t n,  /* columns of B and C */
+                                                        const cuDoubleComplex* alpha,  /* scalar on the A/B product */
+                                                        const cuDoubleComplex* A,  /* Hermitian input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        const cuDoubleComplex* B,  /* input matrix B (const) */
+                                                        int64_t ldb,  /* leading dimension of B */
+                                                        const cuDoubleComplex* beta,  /* scalar on the existing C term */
+                                                        cuDoubleComplex* C,  /* in/out matrix C (the only non-const data pointer) */
+                                                        int64_t ldc);  /* leading dimension of C */
+
+/* TRSM */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsm_v2(cublasHandle_t handle,  /* TRSM for float data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A sits on the left or the right of the unknown X */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     cublasOperation_t trans,  /* op() applied to A */
+                                                     cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                     int m,  /* rows of B */
+                                                     int n,  /* columns of B */
+                                                     const float* alpha,  /* scalar on the right-hand side B */
+                                                     const float* A,  /* triangular input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     float* B,  /* in/out: right-hand side, overwritten with the solution (the only non-const data pointer) */
+                                                     int ldb);  /* leading dimension of B */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasStrsm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A sits on the left or the right of the unknown X */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        cublasOperation_t trans,  /* op() applied to A */
+                                                        cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                        int64_t m,  /* rows of B */
+                                                        int64_t n,  /* columns of B */
+                                                        const float* alpha,  /* scalar on the right-hand side B */
+                                                        const float* A,  /* triangular input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        float* B,  /* in/out: right-hand side, overwritten with the solution (the only non-const data pointer) */
+                                                        int64_t ldb);  /* leading dimension of B */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsm_v2(cublasHandle_t handle,  /* TRSM for double data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A sits on the left or the right of the unknown X */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     cublasOperation_t trans,  /* op() applied to A */
+                                                     cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                     int m,  /* rows of B */
+                                                     int n,  /* columns of B */
+                                                     const double* alpha,  /* scalar on the right-hand side B */
+                                                     const double* A,  /* triangular input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     double* B,  /* in/out: right-hand side, overwritten with the solution (the only non-const data pointer) */
+                                                     int ldb);  /* leading dimension of B */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasDtrsm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A sits on the left or the right of the unknown X */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        cublasOperation_t trans,  /* op() applied to A */
+                                                        cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                        int64_t m,  /* rows of B */
+                                                        int64_t n,  /* columns of B */
+                                                        const double* alpha,  /* scalar on the right-hand side B */
+                                                        const double* A,  /* triangular input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        double* B,  /* in/out: right-hand side, overwritten with the solution (the only non-const data pointer) */
+                                                        int64_t ldb);  /* leading dimension of B */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle,  /* TRSM for cuComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A sits on the left or the right of the unknown X */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     cublasOperation_t trans,  /* op() applied to A */
+                                                     cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                     int m,  /* rows of B */
+                                                     int n,  /* columns of B */
+                                                     const cuComplex* alpha,  /* scalar on the right-hand side B */
+                                                     const cuComplex* A,  /* triangular input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     cuComplex* B,  /* in/out: right-hand side, overwritten with the solution (the only non-const data pointer) */
+                                                     int ldb);  /* leading dimension of B */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasCtrsm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A sits on the left or the right of the unknown X */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        cublasOperation_t trans,  /* op() applied to A */
+                                                        cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                        int64_t m,  /* rows of B */
+                                                        int64_t n,  /* columns of B */
+                                                        const cuComplex* alpha,  /* scalar on the right-hand side B */
+                                                        const cuComplex* A,  /* triangular input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        cuComplex* B,  /* in/out: right-hand side, overwritten with the solution (the only non-const data pointer) */
+                                                        int64_t ldb);  /* leading dimension of B */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle,  /* TRSM for cuDoubleComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A sits on the left or the right of the unknown X */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     cublasOperation_t trans,  /* op() applied to A */
+                                                     cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                     int m,  /* rows of B */
+                                                     int n,  /* columns of B */
+                                                     const cuDoubleComplex* alpha,  /* scalar on the right-hand side B */
+                                                     const cuDoubleComplex* A,  /* triangular input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     cuDoubleComplex* B,  /* in/out: right-hand side, overwritten with the solution (the only non-const data pointer) */
+                                                     int ldb);  /* leading dimension of B */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasZtrsm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A sits on the left or the right of the unknown X */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        cublasOperation_t trans,  /* op() applied to A */
+                                                        cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                        int64_t m,  /* rows of B */
+                                                        int64_t n,  /* columns of B */
+                                                        const cuDoubleComplex* alpha,  /* scalar on the right-hand side B */
+                                                        const cuDoubleComplex* A,  /* triangular input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        cuDoubleComplex* B,  /* in/out: right-hand side, overwritten with the solution (the only non-const data pointer) */
+                                                        int64_t ldb);  /* leading dimension of B */
+
+/* TRMM */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmm_v2(cublasHandle_t handle,  /* TRMM for float data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     cublasOperation_t trans,  /* op() applied to A */
+                                                     cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                     int m,  /* rows of B and C */
+                                                     int n,  /* columns of B and C */
+                                                     const float* alpha,  /* scalar on the triangular product */
+                                                     const float* A,  /* triangular input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     const float* B,  /* input matrix B (const, unlike the TRSM declarations) */
+                                                     int ldb,  /* leading dimension of B */
+                                                     float* C,  /* result matrix C (the only non-const data pointer) */
+                                                     int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasStrmm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        cublasOperation_t trans,  /* op() applied to A */
+                                                        cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                        int64_t m,  /* rows of B and C */
+                                                        int64_t n,  /* columns of B and C */
+                                                        const float* alpha,  /* scalar on the triangular product */
+                                                        const float* A,  /* triangular input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        const float* B,  /* input matrix B (const, unlike the TRSM declarations) */
+                                                        int64_t ldb,  /* leading dimension of B */
+                                                        float* C,  /* result matrix C (the only non-const data pointer) */
+                                                        int64_t ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmm_v2(cublasHandle_t handle,  /* TRMM for double data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     cublasOperation_t trans,  /* op() applied to A */
+                                                     cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                     int m,  /* rows of B and C */
+                                                     int n,  /* columns of B and C */
+                                                     const double* alpha,  /* scalar on the triangular product */
+                                                     const double* A,  /* triangular input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     const double* B,  /* input matrix B (const, unlike the TRSM declarations) */
+                                                     int ldb,  /* leading dimension of B */
+                                                     double* C,  /* result matrix C (the only non-const data pointer) */
+                                                     int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasDtrmm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        cublasOperation_t trans,  /* op() applied to A */
+                                                        cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                        int64_t m,  /* rows of B and C */
+                                                        int64_t n,  /* columns of B and C */
+                                                        const double* alpha,  /* scalar on the triangular product */
+                                                        const double* A,  /* triangular input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        const double* B,  /* input matrix B (const, unlike the TRSM declarations) */
+                                                        int64_t ldb,  /* leading dimension of B */
+                                                        double* C,  /* result matrix C (the only non-const data pointer) */
+                                                        int64_t ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle,  /* TRMM for cuComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     cublasOperation_t trans,  /* op() applied to A */
+                                                     cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                     int m,  /* rows of B and C */
+                                                     int n,  /* columns of B and C */
+                                                     const cuComplex* alpha,  /* scalar on the triangular product */
+                                                     const cuComplex* A,  /* triangular input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     const cuComplex* B,  /* input matrix B (const, unlike the TRSM declarations) */
+                                                     int ldb,  /* leading dimension of B */
+                                                     cuComplex* C,  /* result matrix C (the only non-const data pointer) */
+                                                     int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasCtrmm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        cublasOperation_t trans,  /* op() applied to A */
+                                                        cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                        int64_t m,  /* rows of B and C */
+                                                        int64_t n,  /* columns of B and C */
+                                                        const cuComplex* alpha,  /* scalar on the triangular product */
+                                                        const cuComplex* A,  /* triangular input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        const cuComplex* B,  /* input matrix B (const, unlike the TRSM declarations) */
+                                                        int64_t ldb,  /* leading dimension of B */
+                                                        cuComplex* C,  /* result matrix C (the only non-const data pointer) */
+                                                        int64_t ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle,  /* TRMM for cuDoubleComplex data with int sizes; returns cublasStatus_t. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                     cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                     cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                     cublasOperation_t trans,  /* op() applied to A */
+                                                     cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                     int m,  /* rows of B and C */
+                                                     int n,  /* columns of B and C */
+                                                     const cuDoubleComplex* alpha,  /* scalar on the triangular product */
+                                                     const cuDoubleComplex* A,  /* triangular input matrix A (const) */
+                                                     int lda,  /* leading dimension of A */
+                                                     const cuDoubleComplex* B,  /* input matrix B (const, unlike the TRSM declarations) */
+                                                     int ldb,  /* leading dimension of B */
+                                                     cuDoubleComplex* C,  /* result matrix C (the only non-const data pointer) */
+                                                     int ldc);  /* leading dimension of C */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmm_v2_64(cublasHandle_t handle,  /* int64_t variant of cublasZtrmm_v2: same parameter list with int64_t replacing int. NOTE(review): parameter roles here are taken from the public cuBLAS docs, not from this declaration - confirm. */
+                                                        cublasSideMode_t side,  /* whether A multiplies B from the left or the right */
+                                                        cublasFillMode_t uplo,  /* which triangle of A is stored */
+                                                        cublasOperation_t trans,  /* op() applied to A */
+                                                        cublasDiagType_t diag,  /* whether A's diagonal is treated as unit */
+                                                        int64_t m,  /* rows of B and C */
+                                                        int64_t n,  /* columns of B and C */
+                                                        const cuDoubleComplex* alpha,  /* scalar on the triangular product */
+                                                        const cuDoubleComplex* A,  /* triangular input matrix A (const) */
+                                                        int64_t lda,  /* leading dimension of A */
+                                                        const cuDoubleComplex* B,  /* input matrix B (const, unlike the TRSM declarations) */
+                                                        int64_t ldb,  /* leading dimension of B */
+                                                        cuDoubleComplex* C,  /* result matrix C (the only non-const data pointer) */
+                                                        int64_t ldc);  /* leading dimension of C */
+
+/* BATCH GEMM */
+
+#if defined(__cplusplus) /* the two __half batched GEMM prototypes below are declared only when __cplusplus is defined */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmBatched(cublasHandle_t handle, /* __half batched GEMM prototype (pointer-array form); sizes are int */
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int m,
+                                                         int n,
+                                                         int k,
+                                                         const __half* alpha,
+                                                         const __half* const Aarray[], /* array of const pointers to read-only __half data */
+                                                         int lda,
+                                                         const __half* const Barray[], /* array of const pointers to read-only __half data */
+                                                         int ldb,
+                                                         const __half* beta,
+                                                         __half* const Carray[], /* pointers themselves are const; the __half data they address is writable */
+                                                         int ldc,
+                                                         int batchCount); /* NOTE(review): presumably the length of Aarray/Barray/Carray -- confirm in cuBLAS docs */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmBatched_64(cublasHandle_t handle, /* int64_t twin of cublasHgemmBatched: same parameter order, wider integer types */
+                                                            cublasOperation_t transa,
+                                                            cublasOperation_t transb,
+                                                            int64_t m,
+                                                            int64_t n,
+                                                            int64_t k,
+                                                            const __half* alpha,
+                                                            const __half* const Aarray[],
+                                                            int64_t lda,
+                                                            const __half* const Barray[],
+                                                            int64_t ldb,
+                                                            const __half* beta,
+                                                            __half* const Carray[],
+                                                            int64_t ldc,
+                                                            int64_t batchCount); /* int in cublasHgemmBatched */
+
+#endif /* defined(__cplusplus) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmBatched(cublasHandle_t handle, /* float batched GEMM prototype (pointer-array form); sizes are int */
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int m,
+                                                         int n,
+                                                         int k,
+                                                         const float* alpha,
+                                                         const float* const Aarray[], /* array of const pointers to read-only float data */
+                                                         int lda,
+                                                         const float* const Barray[], /* array of const pointers to read-only float data */
+                                                         int ldb,
+                                                         const float* beta,
+                                                         float* const Carray[], /* pointers themselves are const; the float data they address is writable */
+                                                         int ldc,
+                                                         int batchCount); /* NOTE(review): presumably the length of Aarray/Barray/Carray -- confirm in cuBLAS docs */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmBatched_64(cublasHandle_t handle, /* int64_t twin of cublasSgemmBatched: same parameter order, wider integer types */
+                                                            cublasOperation_t transa,
+                                                            cublasOperation_t transb,
+                                                            int64_t m, /* int in cublasSgemmBatched */
+                                                            int64_t n, /* int in cublasSgemmBatched */
+                                                            int64_t k, /* int in cublasSgemmBatched */
+                                                            const float* alpha,
+                                                            const float* const Aarray[],
+                                                            int64_t lda,
+                                                            const float* const Barray[],
+                                                            int64_t ldb,
+                                                            const float* beta,
+                                                            float* const Carray[], /* pointers are const; the float data they address is writable */
+                                                            int64_t ldc,
+                                                            int64_t batchCount); /* int in cublasSgemmBatched */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmBatched(cublasHandle_t handle, /* double batched GEMM prototype (pointer-array form); sizes are int */
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int m,
+                                                         int n,
+                                                         int k,
+                                                         const double* alpha,
+                                                         const double* const Aarray[], /* array of const pointers to read-only double data */
+                                                         int lda,
+                                                         const double* const Barray[], /* array of const pointers to read-only double data */
+                                                         int ldb,
+                                                         const double* beta,
+                                                         double* const Carray[], /* pointers themselves are const; the double data they address is writable */
+                                                         int ldc,
+                                                         int batchCount); /* NOTE(review): presumably the length of Aarray/Barray/Carray -- confirm in cuBLAS docs */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmBatched_64(cublasHandle_t handle, /* int64_t twin of cublasDgemmBatched: same parameter order, wider integer types */
+                                                            cublasOperation_t transa,
+                                                            cublasOperation_t transb,
+                                                            int64_t m, /* int in cublasDgemmBatched */
+                                                            int64_t n, /* int in cublasDgemmBatched */
+                                                            int64_t k, /* int in cublasDgemmBatched */
+                                                            const double* alpha,
+                                                            const double* const Aarray[],
+                                                            int64_t lda,
+                                                            const double* const Barray[],
+                                                            int64_t ldb,
+                                                            const double* beta,
+                                                            double* const Carray[], /* pointers are const; the double data they address is writable */
+                                                            int64_t ldc,
+                                                            int64_t batchCount); /* int in cublasDgemmBatched */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmBatched(cublasHandle_t handle, /* cuComplex batched GEMM prototype (pointer-array form); sizes are int */
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int m,
+                                                         int n,
+                                                         int k,
+                                                         const cuComplex* alpha,
+                                                         const cuComplex* const Aarray[], /* array of const pointers to read-only cuComplex data */
+                                                         int lda,
+                                                         const cuComplex* const Barray[], /* array of const pointers to read-only cuComplex data */
+                                                         int ldb,
+                                                         const cuComplex* beta,
+                                                         cuComplex* const Carray[], /* pointers themselves are const; the cuComplex data they address is writable */
+                                                         int ldc,
+                                                         int batchCount); /* NOTE(review): presumably the length of Aarray/Barray/Carray -- confirm in cuBLAS docs */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmBatched_64(cublasHandle_t handle, /* int64_t twin of cublasCgemmBatched: same parameter order, wider integer types */
+                                                            cublasOperation_t transa,
+                                                            cublasOperation_t transb,
+                                                            int64_t m, /* int in cublasCgemmBatched */
+                                                            int64_t n, /* int in cublasCgemmBatched */
+                                                            int64_t k, /* int in cublasCgemmBatched */
+                                                            const cuComplex* alpha,
+                                                            const cuComplex* const Aarray[],
+                                                            int64_t lda,
+                                                            const cuComplex* const Barray[],
+                                                            int64_t ldb,
+                                                            const cuComplex* beta,
+                                                            cuComplex* const Carray[], /* pointers are const; the cuComplex data they address is writable */
+                                                            int64_t ldc,
+                                                            int64_t batchCount); /* int in cublasCgemmBatched */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched(cublasHandle_t handle, /* same parameter list as cublasCgemmBatched. NOTE(review): "3m" presumably selects a different complex-multiply algorithm -- confirm in cuBLAS docs */
+                                                           cublasOperation_t transa,
+                                                           cublasOperation_t transb,
+                                                           int m,
+                                                           int n,
+                                                           int k,
+                                                           const cuComplex* alpha,
+                                                           const cuComplex* const Aarray[], /* array of const pointers to read-only cuComplex data */
+                                                           int lda,
+                                                           const cuComplex* const Barray[], /* array of const pointers to read-only cuComplex data */
+                                                           int ldb,
+                                                           const cuComplex* beta,
+                                                           cuComplex* const Carray[], /* pointers themselves are const; the cuComplex data they address is writable */
+                                                           int ldc,
+                                                           int batchCount); /* NOTE(review): presumably the length of Aarray/Barray/Carray -- confirm in cuBLAS docs */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched_64(cublasHandle_t handle, /* int64_t twin of cublasCgemm3mBatched: same parameter order, wider integer types */
+                                                              cublasOperation_t transa,
+                                                              cublasOperation_t transb,
+                                                              int64_t m, /* int in cublasCgemm3mBatched */
+                                                              int64_t n, /* int in cublasCgemm3mBatched */
+                                                              int64_t k, /* int in cublasCgemm3mBatched */
+                                                              const cuComplex* alpha,
+                                                              const cuComplex* const Aarray[],
+                                                              int64_t lda,
+                                                              const cuComplex* const Barray[],
+                                                              int64_t ldb,
+                                                              const cuComplex* beta,
+                                                              cuComplex* const Carray[], /* pointers are const; the cuComplex data they address is writable */
+                                                              int64_t ldc,
+                                                              int64_t batchCount); /* int in cublasCgemm3mBatched */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmBatched(cublasHandle_t handle, /* cuDoubleComplex batched GEMM prototype (pointer-array form); sizes are int */
+                                                         cublasOperation_t transa,
+                                                         cublasOperation_t transb,
+                                                         int m,
+                                                         int n,
+                                                         int k,
+                                                         const cuDoubleComplex* alpha,
+                                                         const cuDoubleComplex* const Aarray[], /* array of const pointers to read-only cuDoubleComplex data */
+                                                         int lda,
+                                                         const cuDoubleComplex* const Barray[], /* array of const pointers to read-only cuDoubleComplex data */
+                                                         int ldb,
+                                                         const cuDoubleComplex* beta,
+                                                         cuDoubleComplex* const Carray[], /* pointers themselves are const; the cuDoubleComplex data they address is writable */
+                                                         int ldc,
+                                                         int batchCount); /* NOTE(review): presumably the length of Aarray/Barray/Carray -- confirm in cuBLAS docs */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmBatched_64(cublasHandle_t handle, /* int64_t twin of cublasZgemmBatched: same parameter order, wider integer types */
+                                                            cublasOperation_t transa,
+                                                            cublasOperation_t transb,
+                                                            int64_t m, /* int in cublasZgemmBatched */
+                                                            int64_t n, /* int in cublasZgemmBatched */
+                                                            int64_t k, /* int in cublasZgemmBatched */
+                                                            const cuDoubleComplex* alpha,
+                                                            const cuDoubleComplex* const Aarray[],
+                                                            int64_t lda,
+                                                            const cuDoubleComplex* const Barray[],
+                                                            int64_t ldb,
+                                                            const cuDoubleComplex* beta,
+                                                            cuDoubleComplex* const Carray[], /* pointers are const; the cuDoubleComplex data they address is writable */
+                                                            int64_t ldc,
+                                                            int64_t batchCount); /* int in cublasZgemmBatched */
+
+#if defined(__cplusplus) /* the two __half strided-batched GEMM prototypes below are declared only when __cplusplus is defined */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmStridedBatched(cublasHandle_t handle, /* __half strided-batched GEMM prototype: one base pointer plus one stride per operand; sizes are int */
+                                                                cublasOperation_t transa,
+                                                                cublasOperation_t transb,
+                                                                int m,
+                                                                int n,
+                                                                int k,
+                                                                const __half* alpha,
+                                                                const __half* A,
+                                                                int lda,
+                                                                long long int strideA, /* NOTE(review): presumably the distance between consecutive A matrices -- confirm units in cuBLAS docs */
+                                                                const __half* B,
+                                                                int ldb,
+                                                                long long int strideB, /* as strideA, but for B (same hedge applies) */
+                                                                const __half* beta,
+                                                                __half* C, /* only data pointer declared without const */
+                                                                int ldc,
+                                                                long long int strideC, /* as strideA, but for C (same hedge applies) */
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmStridedBatched_64(cublasHandle_t handle, /* int64_t twin of cublasHgemmStridedBatched: m, n, k, lda, ldb, ldc and batchCount widen to int64_t */
+                                                                   cublasOperation_t transa,
+                                                                   cublasOperation_t transb,
+                                                                   int64_t m,
+                                                                   int64_t n,
+                                                                   int64_t k,
+                                                                   const __half* alpha,
+                                                                   const __half* A,
+                                                                   int64_t lda,
+                                                                   long long int strideA, /* stride type is long long int here as in the int variant */
+                                                                   const __half* B,
+                                                                   int64_t ldb,
+                                                                   long long int strideB,
+                                                                   const __half* beta,
+                                                                   __half* C,
+                                                                   int64_t ldc,
+                                                                   long long int strideC,
+                                                                   int64_t batchCount); /* int in cublasHgemmStridedBatched */
+
+#endif /* defined(__cplusplus) */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched(cublasHandle_t handle, /* float strided-batched GEMM prototype: one base pointer plus one stride per operand; sizes are int */
+                                                                cublasOperation_t transa,
+                                                                cublasOperation_t transb,
+                                                                int m,
+                                                                int n,
+                                                                int k,
+                                                                const float* alpha,
+                                                                const float* A,
+                                                                int lda,
+                                                                long long int strideA, /* NOTE(review): presumably the distance between consecutive A matrices -- confirm units in cuBLAS docs */
+                                                                const float* B,
+                                                                int ldb,
+                                                                long long int strideB, /* as strideA, but for B (same hedge applies) */
+                                                                const float* beta,
+                                                                float* C, /* only data pointer declared without const */
+                                                                int ldc,
+                                                                long long int strideC, /* as strideA, but for C (same hedge applies) */
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched_64(cublasHandle_t handle, /* int64_t twin of cublasSgemmStridedBatched: m, n, k, lda, ldb, ldc and batchCount widen to int64_t */
+                                                                   cublasOperation_t transa,
+                                                                   cublasOperation_t transb,
+                                                                   int64_t m,
+                                                                   int64_t n,
+                                                                   int64_t k,
+                                                                   const float* alpha,
+                                                                   const float* A,
+                                                                   int64_t lda,
+                                                                   long long int strideA, /* stride type is long long int here as in the int variant */
+                                                                   const float* B,
+                                                                   int64_t ldb,
+                                                                   long long int strideB,
+                                                                   const float* beta,
+                                                                   float* C, /* only data pointer declared without const */
+                                                                   int64_t ldc,
+                                                                   long long int strideC,
+                                                                   int64_t batchCount); /* int in cublasSgemmStridedBatched */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched(cublasHandle_t handle, /* double strided-batched GEMM prototype: one base pointer plus one stride per operand; sizes are int */
+                                                                cublasOperation_t transa,
+                                                                cublasOperation_t transb,
+                                                                int m,
+                                                                int n,
+                                                                int k,
+                                                                const double* alpha,
+                                                                const double* A,
+                                                                int lda,
+                                                                long long int strideA, /* NOTE(review): presumably the distance between consecutive A matrices -- confirm units in cuBLAS docs */
+                                                                const double* B,
+                                                                int ldb,
+                                                                long long int strideB, /* as strideA, but for B (same hedge applies) */
+                                                                const double* beta,
+                                                                double* C, /* only data pointer declared without const */
+                                                                int ldc,
+                                                                long long int strideC, /* as strideA, but for C (same hedge applies) */
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched_64(cublasHandle_t handle, /* int64_t twin of cublasDgemmStridedBatched: m, n, k, lda, ldb, ldc and batchCount widen to int64_t */
+                                                                   cublasOperation_t transa,
+                                                                   cublasOperation_t transb,
+                                                                   int64_t m,
+                                                                   int64_t n,
+                                                                   int64_t k,
+                                                                   const double* alpha,
+                                                                   const double* A,
+                                                                   int64_t lda,
+                                                                   long long int strideA, /* stride type is long long int here as in the int variant */
+                                                                   const double* B,
+                                                                   int64_t ldb,
+                                                                   long long int strideB,
+                                                                   const double* beta,
+                                                                   double* C, /* only data pointer declared without const */
+                                                                   int64_t ldc,
+                                                                   long long int strideC,
+                                                                   int64_t batchCount); /* int in cublasDgemmStridedBatched */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched(cublasHandle_t handle, /* cuComplex strided-batched GEMM prototype: one base pointer plus one stride per operand; sizes are int */
+                                                                cublasOperation_t transa,
+                                                                cublasOperation_t transb,
+                                                                int m,
+                                                                int n,
+                                                                int k,
+                                                                const cuComplex* alpha,
+                                                                const cuComplex* A,
+                                                                int lda,
+                                                                long long int strideA, /* NOTE(review): presumably the distance between consecutive A matrices -- confirm units in cuBLAS docs */
+                                                                const cuComplex* B,
+                                                                int ldb,
+                                                                long long int strideB, /* as strideA, but for B (same hedge applies) */
+                                                                const cuComplex* beta,
+                                                                cuComplex* C, /* only data pointer declared without const */
+                                                                int ldc,
+                                                                long long int strideC, /* as strideA, but for C (same hedge applies) */
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched_64(cublasHandle_t handle, /* int64_t twin of cublasCgemmStridedBatched: m, n, k, lda, ldb, ldc and batchCount widen to int64_t */
+                                                                   cublasOperation_t transa,
+                                                                   cublasOperation_t transb,
+                                                                   int64_t m,
+                                                                   int64_t n,
+                                                                   int64_t k,
+                                                                   const cuComplex* alpha,
+                                                                   const cuComplex* A,
+                                                                   int64_t lda,
+                                                                   long long int strideA, /* stride type is long long int here as in the int variant */
+                                                                   const cuComplex* B,
+                                                                   int64_t ldb,
+                                                                   long long int strideB,
+                                                                   const cuComplex* beta,
+                                                                   cuComplex* C, /* only data pointer declared without const */
+                                                                   int64_t ldc,
+                                                                   long long int strideC,
+                                                                   int64_t batchCount); /* int in cublasCgemmStridedBatched */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched(cublasHandle_t handle, /* same parameter list as cublasCgemmStridedBatched. NOTE(review): "3m" presumably selects a different complex-multiply algorithm -- confirm in cuBLAS docs */
+                                                                  cublasOperation_t transa,
+                                                                  cublasOperation_t transb,
+                                                                  int m,
+                                                                  int n,
+                                                                  int k,
+                                                                  const cuComplex* alpha,
+                                                                  const cuComplex* A,
+                                                                  int lda,
+                                                                  long long int strideA, /* NOTE(review): presumably the distance between consecutive A matrices -- confirm units in cuBLAS docs */
+                                                                  const cuComplex* B,
+                                                                  int ldb,
+                                                                  long long int strideB, /* as strideA, but for B (same hedge applies) */
+                                                                  const cuComplex* beta,
+                                                                  cuComplex* C, /* only data pointer declared without const */
+                                                                  int ldc,
+                                                                  long long int strideC, /* as strideA, but for C (same hedge applies) */
+                                                                  int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched_64(cublasHandle_t handle, /* int64_t twin of cublasCgemm3mStridedBatched: m, n, k, lda, ldb, ldc and batchCount widen to int64_t */
+                                                                     cublasOperation_t transa,
+                                                                     cublasOperation_t transb,
+                                                                     int64_t m,
+                                                                     int64_t n,
+                                                                     int64_t k,
+                                                                     const cuComplex* alpha,
+                                                                     const cuComplex* A,
+                                                                     int64_t lda,
+                                                                     long long int strideA, /* stride type is long long int here as in the int variant */
+                                                                     const cuComplex* B,
+                                                                     int64_t ldb,
+                                                                     long long int strideB,
+                                                                     const cuComplex* beta,
+                                                                     cuComplex* C, /* only data pointer declared without const */
+                                                                     int64_t ldc,
+                                                                     long long int strideC,
+                                                                     int64_t batchCount); /* int in cublasCgemm3mStridedBatched */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched(cublasHandle_t handle, /* cuDoubleComplex strided-batched GEMM prototype: one base pointer plus one stride per operand; sizes are int */
+                                                                cublasOperation_t transa,
+                                                                cublasOperation_t transb,
+                                                                int m,
+                                                                int n,
+                                                                int k,
+                                                                const cuDoubleComplex* alpha,
+                                                                const cuDoubleComplex* A,
+                                                                int lda,
+                                                                long long int strideA, /* NOTE(review): presumably the distance between consecutive A matrices -- confirm units in cuBLAS docs */
+                                                                const cuDoubleComplex* B,
+                                                                int ldb,
+                                                                long long int strideB, /* as strideA, but for B (same hedge applies) */
+                                                                const cuDoubleComplex* beta,
+                                                                cuDoubleComplex* C, /* only data pointer declared without const */
+                                                                int ldc,
+                                                                long long int strideC, /* as strideA, but for C (same hedge applies) */
+                                                                int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmStridedBatched_64(cublasHandle_t handle, /* int64_t twin of cublasZgemmStridedBatched: m, n, k, lda, ldb, ldc and batchCount widen to int64_t */
+                                                                   cublasOperation_t transa,
+                                                                   cublasOperation_t transb,
+                                                                   int64_t m,
+                                                                   int64_t n,
+                                                                   int64_t k,
+                                                                   const cuDoubleComplex* alpha,
+                                                                   const cuDoubleComplex* A,
+                                                                   int64_t lda,
+                                                                   long long int strideA, /* stride type is long long int here as in the int variant */
+                                                                   const cuDoubleComplex* B,
+                                                                   int64_t ldb,
+                                                                   long long int strideB,
+                                                                   const cuDoubleComplex* beta,
+                                                                   cuDoubleComplex* C, /* only data pointer declared without const */
+                                                                   int64_t ldc,
+                                                                   long long int strideC,
+                                                                   int64_t batchCount); /* int in cublasZgemmStridedBatched */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx(cublasHandle_t handle, /* Declaration only: batched GEMM with untyped (void*) operands; each operand has a companion cudaDataType argument. */
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb,
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const void* alpha, /* untyped scalar pointer; NOTE(review): presumably read according to computeType - confirm */
+                                                          const void* const Aarray[], /* array of const pointers to const data */
+                                                          cudaDataType Atype, /* NOTE(review): presumably the element type behind Aarray - confirm */
+                                                          int lda,
+                                                          const void* const Barray[],
+                                                          cudaDataType Btype,
+                                                          int ldb,
+                                                          const void* beta,
+                                                          void* const Carray[], /* the pointers are const, the data they point to is not */
+                                                          cudaDataType Ctype,
+                                                          int ldc,
+                                                          int batchCount,
+                                                          cublasComputeType_t computeType,
+                                                          cublasGemmAlgo_t algo);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasGemmBatchedEx (m, n, k, lda, ldb, ldc, batchCount are int64_t). */
+                                                             cublasOperation_t transa,
+                                                             cublasOperation_t transb,
+                                                             int64_t m,
+                                                             int64_t n,
+                                                             int64_t k,
+                                                             const void* alpha, /* untyped scalar pointer; NOTE(review): presumably read according to computeType - confirm */
+                                                             const void* const Aarray[], /* array of const pointers to const data */
+                                                             cudaDataType Atype,
+                                                             int64_t lda,
+                                                             const void* const Barray[],
+                                                             cudaDataType Btype,
+                                                             int64_t ldb,
+                                                             const void* beta,
+                                                             void* const Carray[], /* the pointers are const, the data they point to is not */
+                                                             cudaDataType Ctype,
+                                                             int64_t ldc,
+                                                             int64_t batchCount,
+                                                             cublasComputeType_t computeType,
+                                                             cublasGemmAlgo_t algo);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx(cublasHandle_t handle, /* Declaration only: strided-batched GEMM with untyped (void*) operands; takes one base pointer plus a stride per matrix instead of pointer arrays. */
+                                                                 cublasOperation_t transa,
+                                                                 cublasOperation_t transb,
+                                                                 int m,
+                                                                 int n,
+                                                                 int k,
+                                                                 const void* alpha, /* untyped scalar pointer; NOTE(review): presumably read according to computeType - confirm */
+                                                                 const void* A,
+                                                                 cudaDataType Atype, /* NOTE(review): presumably the element type behind A - confirm */
+                                                                 int lda,
+                                                                 long long int strideA, /* strides are long long int even though the sizes are int */
+                                                                 const void* B,
+                                                                 cudaDataType Btype,
+                                                                 int ldb,
+                                                                 long long int strideB,
+                                                                 const void* beta,
+                                                                 void* C, /* the only matrix pointer that is not const-qualified */
+                                                                 cudaDataType Ctype,
+                                                                 int ldc,
+                                                                 long long int strideC,
+                                                                 int batchCount,
+                                                                 cublasComputeType_t computeType,
+                                                                 cublasGemmAlgo_t algo);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasGemmStridedBatchedEx (m, n, k, lda, ldb, ldc, batchCount are int64_t). */
+                                                                    cublasOperation_t transa,
+                                                                    cublasOperation_t transb,
+                                                                    int64_t m,
+                                                                    int64_t n,
+                                                                    int64_t k,
+                                                                    const void* alpha, /* untyped scalar pointer; NOTE(review): presumably read according to computeType - confirm */
+                                                                    const void* A,
+                                                                    cudaDataType Atype,
+                                                                    int64_t lda,
+                                                                    long long int strideA, /* strides keep the long long int type used by the int variant */
+                                                                    const void* B,
+                                                                    cudaDataType Btype,
+                                                                    int64_t ldb,
+                                                                    long long int strideB,
+                                                                    const void* beta,
+                                                                    void* C, /* the only matrix pointer that is not const-qualified */
+                                                                    cudaDataType Ctype,
+                                                                    int64_t ldc,
+                                                                    long long int strideC,
+                                                                    int64_t batchCount,
+                                                                    cublasComputeType_t computeType,
+                                                                    cublasGemmAlgo_t algo);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmGroupedBatched(cublasHandle_t handle, /* Declaration only: float grouped-batched GEMM; transposes, sizes, scalars and leading dimensions are all declared as arrays. */
+                                                                const cublasOperation_t transa_array[], /* NOTE(review): the *_array parameters presumably hold one entry per group - confirm in the cuBLAS docs */
+                                                                const cublasOperation_t transb_array[],
+                                                                const int m_array[],
+                                                                const int n_array[],
+                                                                const int k_array[],
+                                                                const float alpha_array[], /* scalars are declared as an array of values here */
+                                                                const float* const Aarray[], /* array of const pointers to const data */
+                                                                const int lda_array[],
+                                                                const float* const Barray[],
+                                                                const int ldb_array[],
+                                                                const float beta_array[],
+                                                                float* const Carray[], /* the pointers are const, the data they point to is not */
+                                                                const int ldc_array[],
+                                                                int group_count,
+                                                                const int group_size[]); /* NOTE(review): presumably the number of problems in each group - confirm */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmGroupedBatched_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasSgemmGroupedBatched (size arrays, group_count and group_size use int64_t). */
+                                                                   const cublasOperation_t transa_array[], /* NOTE(review): the *_array parameters presumably hold one entry per group - confirm in the cuBLAS docs */
+                                                                   const cublasOperation_t transb_array[],
+                                                                   const int64_t m_array[],
+                                                                   const int64_t n_array[],
+                                                                   const int64_t k_array[],
+                                                                   const float alpha_array[], /* scalars are declared as an array of values here */
+                                                                   const float* const Aarray[], /* array of const pointers to const data */
+                                                                   const int64_t lda_array[],
+                                                                   const float* const Barray[],
+                                                                   const int64_t ldb_array[],
+                                                                   const float beta_array[],
+                                                                   float* const Carray[], /* the pointers are const, the data they point to is not */
+                                                                   const int64_t ldc_array[],
+                                                                   int64_t group_count,
+                                                                   const int64_t group_size[]); /* NOTE(review): presumably the number of problems in each group - confirm */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmGroupedBatched(cublasHandle_t handle, /* Declaration only: double grouped-batched GEMM; transposes, sizes, scalars and leading dimensions are all declared as arrays. */
+                                                                const cublasOperation_t transa_array[], /* NOTE(review): the *_array parameters presumably hold one entry per group - confirm in the cuBLAS docs */
+                                                                const cublasOperation_t transb_array[],
+                                                                const int m_array[],
+                                                                const int n_array[],
+                                                                const int k_array[],
+                                                                const double alpha_array[], /* scalars are declared as an array of values here */
+                                                                const double* const Aarray[], /* array of const pointers to const data */
+                                                                const int lda_array[],
+                                                                const double* const Barray[],
+                                                                const int ldb_array[],
+                                                                const double beta_array[],
+                                                                double* const Carray[], /* the pointers are const, the data they point to is not */
+                                                                const int ldc_array[],
+                                                                int group_count,
+                                                                const int group_size[]); /* NOTE(review): presumably the number of problems in each group - confirm */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmGroupedBatched_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasDgemmGroupedBatched (size arrays, group_count and group_size use int64_t). */
+                                                                   const cublasOperation_t transa_array[], /* NOTE(review): the *_array parameters presumably hold one entry per group - confirm in the cuBLAS docs */
+                                                                   const cublasOperation_t transb_array[],
+                                                                   const int64_t m_array[],
+                                                                   const int64_t n_array[],
+                                                                   const int64_t k_array[],
+                                                                   const double alpha_array[], /* scalars are declared as an array of values here */
+                                                                   const double* const Aarray[], /* array of const pointers to const data */
+                                                                   const int64_t lda_array[],
+                                                                   const double* const Barray[],
+                                                                   const int64_t ldb_array[],
+                                                                   const double beta_array[],
+                                                                   double* const Carray[], /* the pointers are const, the data they point to is not */
+                                                                   const int64_t ldc_array[],
+                                                                   int64_t group_count,
+                                                                   const int64_t group_size[]); /* NOTE(review): presumably the number of problems in each group - confirm */
+
+/* ---------------- CUBLAS BLAS-like Extension ---------------- */
+
+/* GEAM */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle, /* Declaration only: float GEAM taking int sizes; two const inputs (A, B) and one non-const matrix C. */
+                                                  cublasOperation_t transa,
+                                                  cublasOperation_t transb,
+                                                  int m,
+                                                  int n,
+                                                  const float* alpha, /* scalar passed by pointer, listed before A */
+                                                  const float* A,
+                                                  int lda,
+                                                  const float* beta, /* second scalar, listed before B (not before C) */
+                                                  const float* B,
+                                                  int ldb,
+                                                  float* C, /* the only matrix pointer that is not const-qualified */
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeam_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasSgeam (m, n, lda, ldb, ldc are int64_t). */
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int64_t m,
+                                                     int64_t n,
+                                                     const float* alpha, /* scalar passed by pointer, listed before A */
+                                                     const float* A,
+                                                     int64_t lda,
+                                                     const float* beta, /* second scalar, listed before B (not before C) */
+                                                     const float* B,
+                                                     int64_t ldb,
+                                                     float* C, /* the only matrix pointer that is not const-qualified */
+                                                     int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle, /* Declaration only: double GEAM taking int sizes; two const inputs (A, B) and one non-const matrix C. */
+                                                  cublasOperation_t transa,
+                                                  cublasOperation_t transb,
+                                                  int m,
+                                                  int n,
+                                                  const double* alpha, /* scalar passed by pointer, listed before A */
+                                                  const double* A,
+                                                  int lda,
+                                                  const double* beta, /* second scalar, listed before B (not before C) */
+                                                  const double* B,
+                                                  int ldb,
+                                                  double* C, /* the only matrix pointer that is not const-qualified */
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeam_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasDgeam (m, n, lda, ldb, ldc are int64_t). */
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int64_t m,
+                                                     int64_t n,
+                                                     const double* alpha, /* scalar passed by pointer, listed before A */
+                                                     const double* A,
+                                                     int64_t lda,
+                                                     const double* beta, /* second scalar, listed before B (not before C) */
+                                                     const double* B,
+                                                     int64_t ldb,
+                                                     double* C, /* the only matrix pointer that is not const-qualified */
+                                                     int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle, /* Declaration only: cuComplex GEAM taking int sizes; two const inputs (A, B) and one non-const matrix C. */
+                                                  cublasOperation_t transa,
+                                                  cublasOperation_t transb,
+                                                  int m,
+                                                  int n,
+                                                  const cuComplex* alpha, /* scalar passed by pointer, listed before A */
+                                                  const cuComplex* A,
+                                                  int lda,
+                                                  const cuComplex* beta, /* second scalar, listed before B (not before C) */
+                                                  const cuComplex* B,
+                                                  int ldb,
+                                                  cuComplex* C, /* the only matrix pointer that is not const-qualified */
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeam_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasCgeam (m, n, lda, ldb, ldc are int64_t). */
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int64_t m,
+                                                     int64_t n,
+                                                     const cuComplex* alpha, /* scalar passed by pointer, listed before A */
+                                                     const cuComplex* A,
+                                                     int64_t lda,
+                                                     const cuComplex* beta, /* second scalar, listed before B (not before C) */
+                                                     const cuComplex* B,
+                                                     int64_t ldb,
+                                                     cuComplex* C, /* the only matrix pointer that is not const-qualified */
+                                                     int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle, /* Declaration only: cuDoubleComplex GEAM taking int sizes; two const inputs (A, B) and one non-const matrix C. */
+                                                  cublasOperation_t transa,
+                                                  cublasOperation_t transb,
+                                                  int m,
+                                                  int n,
+                                                  const cuDoubleComplex* alpha, /* scalar passed by pointer, listed before A */
+                                                  const cuDoubleComplex* A,
+                                                  int lda,
+                                                  const cuDoubleComplex* beta, /* second scalar, listed before B (not before C) */
+                                                  const cuDoubleComplex* B,
+                                                  int ldb,
+                                                  cuDoubleComplex* C, /* the only matrix pointer that is not const-qualified */
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeam_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasZgeam (m, n, lda, ldb, ldc are int64_t). */
+                                                     cublasOperation_t transa,
+                                                     cublasOperation_t transb,
+                                                     int64_t m,
+                                                     int64_t n,
+                                                     const cuDoubleComplex* alpha, /* scalar passed by pointer, listed before A */
+                                                     const cuDoubleComplex* A,
+                                                     int64_t lda,
+                                                     const cuDoubleComplex* beta, /* second scalar, listed before B (not before C) */
+                                                     const cuDoubleComplex* B,
+                                                     int64_t ldb,
+                                                     cuDoubleComplex* C, /* the only matrix pointer that is not const-qualified */
+                                                     int64_t ldc);
+
+/* TRSM - Batched Triangular Solver */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsmBatched(cublasHandle_t handle, /* Declaration only: float batched TRSM taking int sizes; A and B are arrays of pointers. */
+                                                         cublasSideMode_t side,
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         cublasDiagType_t diag,
+                                                         int m,
+                                                         int n,
+                                                         const float* alpha, /* a single scalar pointer, not an array */
+                                                         const float* const A[], /* const pointers to const data */
+                                                         int lda,
+                                                         float* const B[], /* const pointers to mutable data; NOTE(review): presumably overwritten with the solution - confirm in the cuBLAS docs */
+                                                         int ldb,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsmBatched_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasStrsmBatched (m, n, lda, ldb, batchCount are int64_t). */
+                                                            cublasSideMode_t side,
+                                                            cublasFillMode_t uplo,
+                                                            cublasOperation_t trans,
+                                                            cublasDiagType_t diag,
+                                                            int64_t m,
+                                                            int64_t n,
+                                                            const float* alpha, /* a single scalar pointer, not an array */
+                                                            const float* const A[], /* const pointers to const data */
+                                                            int64_t lda,
+                                                            float* const B[], /* const pointers to mutable data; NOTE(review): presumably overwritten with the solution - confirm in the cuBLAS docs */
+                                                            int64_t ldb,
+                                                            int64_t batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsmBatched(cublasHandle_t handle, /* Declaration only: double batched TRSM taking int sizes; A and B are arrays of pointers. */
+                                                         cublasSideMode_t side,
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         cublasDiagType_t diag,
+                                                         int m,
+                                                         int n,
+                                                         const double* alpha, /* a single scalar pointer, not an array */
+                                                         const double* const A[], /* const pointers to const data */
+                                                         int lda,
+                                                         double* const B[], /* const pointers to mutable data; NOTE(review): presumably overwritten with the solution - confirm in the cuBLAS docs */
+                                                         int ldb,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsmBatched_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasDtrsmBatched (m, n, lda, ldb, batchCount are int64_t). */
+                                                            cublasSideMode_t side,
+                                                            cublasFillMode_t uplo,
+                                                            cublasOperation_t trans,
+                                                            cublasDiagType_t diag,
+                                                            int64_t m,
+                                                            int64_t n,
+                                                            const double* alpha, /* a single scalar pointer, not an array */
+                                                            const double* const A[], /* const pointers to const data */
+                                                            int64_t lda,
+                                                            double* const B[], /* const pointers to mutable data; NOTE(review): presumably overwritten with the solution - confirm in the cuBLAS docs */
+                                                            int64_t ldb,
+                                                            int64_t batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsmBatched(cublasHandle_t handle, /* Declaration only: cuComplex batched TRSM taking int sizes; A and B are arrays of pointers. */
+                                                         cublasSideMode_t side,
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         cublasDiagType_t diag,
+                                                         int m,
+                                                         int n,
+                                                         const cuComplex* alpha, /* a single scalar pointer, not an array */
+                                                         const cuComplex* const A[], /* const pointers to const data */
+                                                         int lda,
+                                                         cuComplex* const B[], /* const pointers to mutable data; NOTE(review): presumably overwritten with the solution - confirm in the cuBLAS docs */
+                                                         int ldb,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsmBatched_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasCtrsmBatched (m, n, lda, ldb, batchCount are int64_t). */
+                                                            cublasSideMode_t side,
+                                                            cublasFillMode_t uplo,
+                                                            cublasOperation_t trans,
+                                                            cublasDiagType_t diag,
+                                                            int64_t m,
+                                                            int64_t n,
+                                                            const cuComplex* alpha, /* a single scalar pointer, not an array */
+                                                            const cuComplex* const A[], /* const pointers to const data */
+                                                            int64_t lda,
+                                                            cuComplex* const B[], /* const pointers to mutable data; NOTE(review): presumably overwritten with the solution - confirm in the cuBLAS docs */
+                                                            int64_t ldb,
+                                                            int64_t batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsmBatched(cublasHandle_t handle, /* Declaration only: cuDoubleComplex batched TRSM taking int sizes; A and B are arrays of pointers. */
+                                                         cublasSideMode_t side,
+                                                         cublasFillMode_t uplo,
+                                                         cublasOperation_t trans,
+                                                         cublasDiagType_t diag,
+                                                         int m,
+                                                         int n,
+                                                         const cuDoubleComplex* alpha, /* a single scalar pointer, not an array */
+                                                         const cuDoubleComplex* const A[], /* const pointers to const data */
+                                                         int lda,
+                                                         cuDoubleComplex* const B[], /* const pointers to mutable data; NOTE(review): presumably overwritten with the solution - confirm in the cuBLAS docs */
+                                                         int ldb,
+                                                         int batchCount);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsmBatched_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasZtrsmBatched (m, n, lda, ldb, batchCount are int64_t). */
+                                                            cublasSideMode_t side,
+                                                            cublasFillMode_t uplo,
+                                                            cublasOperation_t trans,
+                                                            cublasDiagType_t diag,
+                                                            int64_t m,
+                                                            int64_t n,
+                                                            const cuDoubleComplex* alpha, /* a single scalar pointer, not an array */
+                                                            const cuDoubleComplex* const A[], /* const pointers to const data */
+                                                            int64_t lda,
+                                                            cuDoubleComplex* const B[], /* const pointers to mutable data; NOTE(review): presumably overwritten with the solution - confirm in the cuBLAS docs */
+                                                            int64_t ldb,
+                                                            int64_t batchCount);
+
+/* DGMM */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle, /* Declaration only: float DGMM taking int sizes; const inputs A and x, non-const matrix C, and no alpha/beta scalars. */
+                                                  cublasSideMode_t mode, /* a cublasSideMode_t, named 'mode' in this prototype */
+                                                  int m,
+                                                  int n,
+                                                  const float* A,
+                                                  int lda,
+                                                  const float* x,
+                                                  int incx, /* NOTE(review): presumably the element stride of x - confirm in the cuBLAS docs */
+                                                  float* C, /* the only data pointer that is not const-qualified */
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdgmm_64(cublasHandle_t handle, /* Declaration only: int64_t counterpart of cublasSdgmm (m, n, lda, incx, ldc are int64_t). */
+                                                     cublasSideMode_t mode, /* a cublasSideMode_t, named 'mode' in this prototype */
+                                                     int64_t m,
+                                                     int64_t n,
+                                                     const float* A,
+                                                     int64_t lda,
+                                                     const float* x,
+                                                     int64_t incx, /* NOTE(review): presumably the element stride of x - confirm in the cuBLAS docs */
+                                                     float* C, /* the only data pointer that is not const-qualified */
+                                                     int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle, /* Declaration only: double DGMM taking int sizes; const inputs A and x, non-const matrix C, and no alpha/beta scalars. */
+                                                  cublasSideMode_t mode, /* a cublasSideMode_t, named 'mode' in this prototype */
+                                                  int m,
+                                                  int n,
+                                                  const double* A,
+                                                  int lda,
+                                                  const double* x,
+                                                  int incx, /* NOTE(review): presumably the element stride of x - confirm in the cuBLAS docs */
+                                                  double* C, /* the only data pointer that is not const-qualified */
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdgmm_64(cublasHandle_t handle,
+                                                     cublasSideMode_t mode,
+                                                     int64_t m,
+                                                     int64_t n,
+                                                     const double* A,
+                                                     int64_t lda,
+                                                     const double* x,
+                                                     int64_t incx,
+                                                     double* C,
+                                                     int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode,
+                                                  int m,
+                                                  int n,
+                                                  const cuComplex* A,
+                                                  int lda,
+                                                  const cuComplex* x,
+                                                  int incx,
+                                                  cuComplex* C,
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdgmm_64(cublasHandle_t handle,
+                                                     cublasSideMode_t mode,
+                                                     int64_t m,
+                                                     int64_t n,
+                                                     const cuComplex* A,
+                                                     int64_t lda,
+                                                     const cuComplex* x,
+                                                     int64_t incx,
+                                                     cuComplex* C,
+                                                     int64_t ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle,
+                                                  cublasSideMode_t mode,
+                                                  int m,
+                                                  int n,
+                                                  const cuDoubleComplex* A,
+                                                  int lda,
+                                                  const cuDoubleComplex* x,
+                                                  int incx,
+                                                  cuDoubleComplex* C,
+                                                  int ldc);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdgmm_64(cublasHandle_t handle,
+                                                     cublasSideMode_t mode,
+                                                     int64_t m,
+                                                     int64_t n,
+                                                     const cuDoubleComplex* A,
+                                                     int64_t lda,
+                                                     const cuDoubleComplex* x,
+                                                     int64_t incx,
+                                                     cuDoubleComplex* C,
+                                                     int64_t ldc);
+
+/* Batched MATINV. A[] holds const pointers to const S/D/C/Z data; Ainv[] holds const pointers to writable data. */
+/* info is a writable int pointer; n, lda, lda_inv and batchSize are all plain int. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle,
+                                                           int n,
+                                                           const float* const A[],
+                                                           int lda,
+                                                           float* const Ainv[],
+                                                           int lda_inv,
+                                                           int* info,
+                                                           int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle,
+                                                           int n,
+                                                           const double* const A[],
+                                                           int lda,
+                                                           double* const Ainv[],
+                                                           int lda_inv,
+                                                           int* info,
+                                                           int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle,
+                                                           int n,
+                                                           const cuComplex* const A[],
+                                                           int lda,
+                                                           cuComplex* const Ainv[],
+                                                           int lda_inv,
+                                                           int* info,
+                                                           int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle,
+                                                           int n,
+                                                           const cuDoubleComplex* const A[],
+                                                           int lda,
+                                                           cuDoubleComplex* const Ainv[],
+                                                           int lda_inv,
+                                                           int* info,
+                                                           int batchSize);
+
+/* Batched QR factorization (GEQRF). Aarray[] and TauArray[] hold const pointers to writable S/D/C/Z data; */
+/* info is a writable int pointer; m, n, lda and batchSize are all plain int. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched(cublasHandle_t handle,
+                                                          int m,
+                                                          int n,
+                                                          float* const Aarray[],
+                                                          int lda,
+                                                          float* const TauArray[],
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeqrfBatched(cublasHandle_t handle,
+                                                          int m,
+                                                          int n,
+                                                          double* const Aarray[],
+                                                          int lda,
+                                                          double* const TauArray[],
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeqrfBatched(cublasHandle_t handle,
+                                                          int m,
+                                                          int n,
+                                                          cuComplex* const Aarray[],
+                                                          int lda,
+                                                          cuComplex* const TauArray[],
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched(cublasHandle_t handle,
+                                                          int m,
+                                                          int n,
+                                                          cuDoubleComplex* const Aarray[],
+                                                          int lda,
+                                                          cuDoubleComplex* const TauArray[],
+                                                          int* info,
+                                                          int batchSize);
+
+/* Batched least squares (GELS). Per the original note: only m >= n and the non-transpose case are supported. */
+/* Aarray[] and Carray[] hold const pointers to writable S/D/C/Z data; info and devInfoArray are writable int*. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgelsBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         int nrhs,
+                                                         float* const Aarray[],
+                                                         int lda,
+                                                         float* const Carray[],
+                                                         int ldc,
+                                                         int* info,
+                                                         int* devInfoArray,
+                                                         int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgelsBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         int nrhs,
+                                                         double* const Aarray[],
+                                                         int lda,
+                                                         double* const Carray[],
+                                                         int ldc,
+                                                         int* info,
+                                                         int* devInfoArray,
+                                                         int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgelsBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         int nrhs,
+                                                         cuComplex* const Aarray[],
+                                                         int lda,
+                                                         cuComplex* const Carray[],
+                                                         int ldc,
+                                                         int* info,
+                                                         int* devInfoArray,
+                                                         int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgelsBatched(cublasHandle_t handle,
+                                                         cublasOperation_t trans,
+                                                         int m,
+                                                         int n,
+                                                         int nrhs,
+                                                         cuDoubleComplex* const Aarray[],
+                                                         int lda,
+                                                         cuDoubleComplex* const Carray[],
+                                                         int ldc,
+                                                         int* info,
+                                                         int* devInfoArray,
+                                                         int batchSize);
+
+/* TPTTR: triangular packed format (AP, const) to triangular format (A, writable, with lda); S/D/C/Z variants. */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* AP, float* A, int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* AP, double* A, int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* AP, cuComplex* A, int lda);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpttr(
+    cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* AP, cuDoubleComplex* A, int lda);
+
+/* TRTTP: triangular format (A, const, with lda) to triangular packed format (AP, writable); S/D/C/Z variants. */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* A, int lda, float* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* A, int lda, double* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* A, int lda, cuComplex* AP);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrttp(
+    cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* AP);
+
+/* Batched LU factorization (GETRF). A[] holds const pointers to writable S/D/C/Z data; P and info are writable int*. */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasSgetrfBatched(cublasHandle_t handle, int n, float* const A[], int lda, int* P, int* info, int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasDgetrfBatched(cublasHandle_t handle, int n, double* const A[], int lda, int* P, int* info, int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI
+cublasCgetrfBatched(cublasHandle_t handle, int n, cuComplex* const A[], int lda, int* P, int* info, int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(
+    cublasHandle_t handle, int n, cuDoubleComplex* const A[], int lda, int* P, int* info, int batchSize);
+
+/* Batched inversion (GETRI) based on the LU factorization from getrf. A[] and P are const in this call; */
+/* C[] holds const pointers to writable S/D/C/Z data and info is a writable int pointer. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          const float* const A[],
+                                                          int lda,
+                                                          const int* P,
+                                                          float* const C[],
+                                                          int ldc,
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          const double* const A[],
+                                                          int lda,
+                                                          const int* P,
+                                                          double* const C[],
+                                                          int ldc,
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          const cuComplex* const A[],
+                                                          int lda,
+                                                          const int* P,
+                                                          cuComplex* const C[],
+                                                          int ldc,
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t handle,
+                                                          int n,
+                                                          const cuDoubleComplex* const A[],
+                                                          int lda,
+                                                          const int* P,
+                                                          cuDoubleComplex* const C[],
+                                                          int ldc,
+                                                          int* info,
+                                                          int batchSize);
+
+/* Batched solver (GETRS) based on the LU factorization from getrf. Aarray[] and devIpiv are const in this call; */
+/* Barray[] holds const pointers to writable S/D/C/Z data and info is a writable int pointer. */
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrsBatched(cublasHandle_t handle,
+                                                          cublasOperation_t trans,
+                                                          int n,
+                                                          int nrhs,
+                                                          const float* const Aarray[],
+                                                          int lda,
+                                                          const int* devIpiv,
+                                                          float* const Barray[],
+                                                          int ldb,
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrsBatched(cublasHandle_t handle,
+                                                          cublasOperation_t trans,
+                                                          int n,
+                                                          int nrhs,
+                                                          const double* const Aarray[],
+                                                          int lda,
+                                                          const int* devIpiv,
+                                                          double* const Barray[],
+                                                          int ldb,
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrsBatched(cublasHandle_t handle,
+                                                          cublasOperation_t trans,
+                                                          int n,
+                                                          int nrhs,
+                                                          const cuComplex* const Aarray[],
+                                                          int lda,
+                                                          const int* devIpiv,
+                                                          cuComplex* const Barray[],
+                                                          int ldb,
+                                                          int* info,
+                                                          int batchSize);
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrsBatched(cublasHandle_t handle,
+                                                          cublasOperation_t trans,
+                                                          int n,
+                                                          int nrhs,
+                                                          const cuDoubleComplex* const Aarray[],
+                                                          int lda,
+                                                          const int* devIpiv,
+                                                          cuDoubleComplex* const Barray[],
+                                                          int ldb,
+                                                          int* info,
+                                                          int batchSize);
+
+/* Deprecated. Takes unsigned char A, B (const) and C (writable), an int bias per matrix, plus C_mult and C_shift. */
+
+CUBLASAPI cublasStatus_t CUBLASWINAPI cublasUint8gemmBias(cublasHandle_t handle,
+                                                          cublasOperation_t transa,
+                                                          cublasOperation_t transb,
+                                                          cublasOperation_t transc,
+                                                          int m,
+                                                          int n,
+                                                          int k,
+                                                          const unsigned char* A,
+                                                          int A_bias,
+                                                          int lda,
+                                                          const unsigned char* B,
+                                                          int B_bias,
+                                                          int ldb,
+                                                          unsigned char* C,
+                                                          int C_bias,
+                                                          int ldc,
+                                                          int C_mult,
+                                                          int C_shift);
+
+/* }}} cuBLAS Exported API */
+
+#if defined(__cplusplus)
+}
+
+/* Translate a legacy cudaDataType_t compute type into cublasComputeType_t, honoring the handle's math mode. */
+static inline cublasStatus_t cublasMigrateComputeType(cublasHandle_t handle,
+                                                      cudaDataType_t dataType,
+                                                      cublasComputeType_t* computeType) {
+  cublasMath_t currentMode = CUBLAS_DEFAULT_MATH;
+  const cublasStatus_t queryStatus = cublasGetMathMode(handle, &currentMode);
+  if (queryStatus != CUBLAS_STATUS_SUCCESS) {
+    return queryStatus; /* math-mode query failed: *computeType is not written */
+  }
+
+  /* Pedantic when the low four bits of the math mode equal CUBLAS_PEDANTIC_MATH. */
+  const bool wantPedantic = ((currentMode & 0xf) == CUBLAS_PEDANTIC_MATH);
+  switch (dataType) {
+    case CUDA_R_32F:
+    case CUDA_C_32F:
+      *computeType = wantPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F;
+      break;
+    case CUDA_R_64F:
+    case CUDA_C_64F:
+      *computeType = wantPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F;
+      break;
+    case CUDA_R_16F:
+      *computeType = wantPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F;
+      break;
+    case CUDA_R_32I:
+      *computeType = wantPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I;
+      break;
+    default: /* no mapping for any other data type: *computeType is not written */
+      return CUBLAS_STATUS_NOT_SUPPORTED;
+  }
+  return CUBLAS_STATUS_SUCCESS;
+}
+/* Wrappers that let legacy C++ code pass a cudaDataType as computeType; each maps it with cublasMigrateComputeType(). */
+/*
+ * C++-only overload of cublasGemmEx for legacy callers that still pass the
+ * compute type as a cudaDataType.
+ *
+ * The cudaDataType is converted with cublasMigrateComputeType(), which also
+ * consults the math mode of `handle`; every other argument is forwarded as is.
+ * alpha and beta may each be a host or a device pointer.
+ *
+ * Returns the status from cublasMigrateComputeType() if the conversion fails
+ * (CUBLAS_STATUS_NOT_SUPPORTED for a data type it does not map, or the error
+ * from the math-mode query); otherwise the status of the forwarded call.
+ */
+static inline cublasStatus_t cublasGemmEx(cublasHandle_t handle,
+                                          cublasOperation_t transa,
+                                          cublasOperation_t transb,
+                                          int m,
+                                          int n,
+                                          int k,
+                                          const void* alpha, /* host or device pointer */
+                                          const void* A,
+                                          cudaDataType Atype,
+                                          int lda,
+                                          const void* B,
+                                          cudaDataType Btype,
+                                          int ldb,
+                                          const void* beta, /* host or device pointer */
+                                          void* C,
+                                          cudaDataType Ctype,
+                                          int ldc,
+                                          cudaDataType computeType,
+                                          cublasGemmAlgo_t algo) {
+  cublasComputeType_t newComputeType = CUBLAS_COMPUTE_32F;
+  const cublasStatus_t migrateStatus = cublasMigrateComputeType(handle, computeType, &newComputeType);
+  if (migrateStatus != CUBLAS_STATUS_SUCCESS) {
+    return migrateStatus;
+  }
+
+  /* The argument is now a cublasComputeType_t, so this call is expected to bind to the */
+  /* exported cublasGemmEx declared earlier in this header rather than to this wrapper. */
+  return cublasGemmEx(handle, transa, transb, m, n, k, alpha,
+                      A, Atype, lda,
+                      B, Btype, ldb,
+                      beta,
+                      C, Ctype, ldc,
+                      newComputeType, algo);
+}
+
+/* C++-only overload: migrates a legacy cudaDataType computeType, then forwards to cublasGemmBatchedEx. */
+static inline cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle,
+                                                 cublasOperation_t transa,
+                                                 cublasOperation_t transb,
+                                                 int m,
+                                                 int n,
+                                                 int k,
+                                                 const void* alpha, /* host or device pointer */
+                                                 const void* const Aarray[],
+                                                 cudaDataType Atype,
+                                                 int lda,
+                                                 const void* const Barray[],
+                                                 cudaDataType Btype,
+                                                 int ldb,
+                                                 const void* beta, /* host or device pointer */
+                                                 void* const Carray[],
+                                                 cudaDataType Ctype,
+                                                 int ldc,
+                                                 int batchCount,
+                                                 cudaDataType computeType,
+                                                 cublasGemmAlgo_t algo) {
+  /* Both locals are initialized at declaration, as the cublasGemmEx wrapper does. */
+  cublasComputeType_t migratedComputeType = CUBLAS_COMPUTE_32F;
+  cublasStatus_t status = cublasMigrateComputeType(handle, computeType, &migratedComputeType);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    return status; /* conversion failed: nothing is forwarded */
+  }
+  return cublasGemmBatchedEx(handle,
+                             transa,
+                             transb,
+                             m,
+                             n,
+                             k,
+                             alpha,
+                             Aarray,
+                             Atype,
+                             lda,
+                             Barray,
+                             Btype,
+                             ldb,
+                             beta,
+                             Carray,
+                             Ctype,
+                             ldc,
+                             batchCount,
+                             migratedComputeType,
+                             algo);
+}
+
+/* C++-only overload: migrates a legacy cudaDataType computeType, then forwards to cublasGemmStridedBatchedEx. */
+static inline cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle,
+                                                        cublasOperation_t transa,
+                                                        cublasOperation_t transb,
+                                                        int m,
+                                                        int n,
+                                                        int k,
+                                                        const void* alpha, /* host or device pointer */
+                                                        const void* A,
+                                                        cudaDataType Atype,
+                                                        int lda,
+                                                        long long int strideA, /* purposely signed */
+                                                        const void* B,
+                                                        cudaDataType Btype,
+                                                        int ldb,
+                                                        long long int strideB,
+                                                        const void* beta, /* host or device pointer */
+                                                        void* C,
+                                                        cudaDataType Ctype,
+                                                        int ldc,
+                                                        long long int strideC,
+                                                        int batchCount,
+                                                        cudaDataType computeType,
+                                                        cublasGemmAlgo_t algo) {
+  /* Both locals are initialized at declaration, as the cublasGemmEx wrapper does. */
+  cublasComputeType_t migratedComputeType = CUBLAS_COMPUTE_32F;
+  cublasStatus_t status = cublasMigrateComputeType(handle, computeType, &migratedComputeType);
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    return status; /* conversion failed: nothing is forwarded */
+  }
+  return cublasGemmStridedBatchedEx(handle,
+                                    transa,
+                                    transb,
+                                    m,
+                                    n,
+                                    k,
+                                    alpha,
+                                    A,
+                                    Atype,
+                                    lda,
+                                    strideA,
+                                    B,
+                                    Btype,
+                                    ldb,
+                                    strideB,
+                                    beta,
+                                    C,
+                                    Ctype,
+                                    ldc,
+                                    strideC,
+                                    batchCount,
+                                    migratedComputeType,
+                                    algo);
+}
+#endif /* __cplusplus */
+
+#endif /* !defined(CUBLAS_API_H_) */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h
new file mode 100644
index 0000000000000000000000000000000000000000..bd81a3b1d8e7e3d04d6c54f4c0640af7d8893eab
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h
@@ -0,0 +1,478 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ * This is the public header file for the new CUBLAS library API; it maps the generic
+ * CUBLAS function names to the actual _v2 implementations.
+ */
+
+#if !defined(CUBLAS_V2_H_)
+#define CUBLAS_V2_H_
+
+#if defined(CUBLAS_H_)
+#error "It is an error to include both cublas.h and cublas_v2.h"
+#endif
+
+#undef CUBLASAPI
+#ifdef __CUDACC__
+#define CUBLASAPI __host__ __device__
+#else
+#define CUBLASAPI
+#endif
+
+#include "cublas_api.h"
+
+#define cublasCreate cublasCreate_v2
+#define cublasDestroy cublasDestroy_v2
+#define cublasGetVersion cublasGetVersion_v2
+#define cublasSetWorkspace cublasSetWorkspace_v2
+#define cublasSetStream cublasSetStream_v2
+#define cublasGetStream cublasGetStream_v2
+#define cublasGetPointerMode cublasGetPointerMode_v2
+#define cublasSetPointerMode cublasSetPointerMode_v2
+
+/* 32-bit integer interface: generic names are aliased to the _v2 entry points */
+
+/* Blas1 Routines   */
+
+#define cublasSnrm2 cublasSnrm2_v2
+#define cublasDnrm2 cublasDnrm2_v2
+#define cublasScnrm2 cublasScnrm2_v2
+#define cublasDznrm2 cublasDznrm2_v2
+
+#define cublasSdot cublasSdot_v2
+#define cublasDdot cublasDdot_v2
+#define cublasCdotu cublasCdotu_v2
+#define cublasCdotc cublasCdotc_v2
+#define cublasZdotu cublasZdotu_v2
+#define cublasZdotc cublasZdotc_v2
+
+#define cublasSscal cublasSscal_v2
+#define cublasDscal cublasDscal_v2
+#define cublasCscal cublasCscal_v2
+#define cublasCsscal cublasCsscal_v2
+#define cublasZscal cublasZscal_v2
+#define cublasZdscal cublasZdscal_v2
+
+#define cublasSaxpy cublasSaxpy_v2
+#define cublasDaxpy cublasDaxpy_v2
+#define cublasCaxpy cublasCaxpy_v2
+#define cublasZaxpy cublasZaxpy_v2
+
+#define cublasScopy cublasScopy_v2
+#define cublasDcopy cublasDcopy_v2
+#define cublasCcopy cublasCcopy_v2
+#define cublasZcopy cublasZcopy_v2
+
+#define cublasSswap cublasSswap_v2
+#define cublasDswap cublasDswap_v2
+#define cublasCswap cublasCswap_v2
+#define cublasZswap cublasZswap_v2
+
+#define cublasIsamax cublasIsamax_v2
+#define cublasIdamax cublasIdamax_v2
+#define cublasIcamax cublasIcamax_v2
+#define cublasIzamax cublasIzamax_v2
+
+#define cublasIsamin cublasIsamin_v2
+#define cublasIdamin cublasIdamin_v2
+#define cublasIcamin cublasIcamin_v2
+#define cublasIzamin cublasIzamin_v2
+
+#define cublasSasum cublasSasum_v2
+#define cublasDasum cublasDasum_v2
+#define cublasScasum cublasScasum_v2
+#define cublasDzasum cublasDzasum_v2
+
+#define cublasSrot cublasSrot_v2
+#define cublasDrot cublasDrot_v2
+#define cublasCrot cublasCrot_v2
+#define cublasCsrot cublasCsrot_v2
+#define cublasZrot cublasZrot_v2
+#define cublasZdrot cublasZdrot_v2
+
+#define cublasSrotg cublasSrotg_v2
+#define cublasDrotg cublasDrotg_v2
+#define cublasCrotg cublasCrotg_v2
+#define cublasZrotg cublasZrotg_v2
+
+#define cublasSrotm cublasSrotm_v2
+#define cublasDrotm cublasDrotm_v2
+
+#define cublasSrotmg cublasSrotmg_v2
+#define cublasDrotmg cublasDrotmg_v2
+
+/* Blas2 Routines */
+
+#define cublasSgemv cublasSgemv_v2
+#define cublasDgemv cublasDgemv_v2
+#define cublasCgemv cublasCgemv_v2
+#define cublasZgemv cublasZgemv_v2
+
+#define cublasSgbmv cublasSgbmv_v2
+#define cublasDgbmv cublasDgbmv_v2
+#define cublasCgbmv cublasCgbmv_v2
+#define cublasZgbmv cublasZgbmv_v2
+
+#define cublasStrmv cublasStrmv_v2
+#define cublasDtrmv cublasDtrmv_v2
+#define cublasCtrmv cublasCtrmv_v2
+#define cublasZtrmv cublasZtrmv_v2
+
+#define cublasStbmv cublasStbmv_v2
+#define cublasDtbmv cublasDtbmv_v2
+#define cublasCtbmv cublasCtbmv_v2
+#define cublasZtbmv cublasZtbmv_v2
+
+#define cublasStpmv cublasStpmv_v2
+#define cublasDtpmv cublasDtpmv_v2
+#define cublasCtpmv cublasCtpmv_v2
+#define cublasZtpmv cublasZtpmv_v2
+
+#define cublasStrsv cublasStrsv_v2
+#define cublasDtrsv cublasDtrsv_v2
+#define cublasCtrsv cublasCtrsv_v2
+#define cublasZtrsv cublasZtrsv_v2
+
+#define cublasStpsv cublasStpsv_v2
+#define cublasDtpsv cublasDtpsv_v2
+#define cublasCtpsv cublasCtpsv_v2
+#define cublasZtpsv cublasZtpsv_v2
+
+#define cublasStbsv cublasStbsv_v2
+#define cublasDtbsv cublasDtbsv_v2
+#define cublasCtbsv cublasCtbsv_v2
+#define cublasZtbsv cublasZtbsv_v2
+
+#define cublasSsymv cublasSsymv_v2
+#define cublasDsymv cublasDsymv_v2
+#define cublasCsymv cublasCsymv_v2
+#define cublasZsymv cublasZsymv_v2
+#define cublasChemv cublasChemv_v2
+#define cublasZhemv cublasZhemv_v2
+
+#define cublasSsbmv cublasSsbmv_v2
+#define cublasDsbmv cublasDsbmv_v2
+#define cublasChbmv cublasChbmv_v2
+#define cublasZhbmv cublasZhbmv_v2
+
+#define cublasSspmv cublasSspmv_v2
+#define cublasDspmv cublasDspmv_v2
+#define cublasChpmv cublasChpmv_v2
+#define cublasZhpmv cublasZhpmv_v2
+
+#define cublasSger cublasSger_v2
+#define cublasDger cublasDger_v2
+#define cublasCgeru cublasCgeru_v2
+#define cublasCgerc cublasCgerc_v2
+#define cublasZgeru cublasZgeru_v2
+#define cublasZgerc cublasZgerc_v2
+
+#define cublasSsyr cublasSsyr_v2
+#define cublasDsyr cublasDsyr_v2
+#define cublasCsyr cublasCsyr_v2
+#define cublasZsyr cublasZsyr_v2
+#define cublasCher cublasCher_v2
+#define cublasZher cublasZher_v2
+
+#define cublasSspr cublasSspr_v2
+#define cublasDspr cublasDspr_v2
+#define cublasChpr cublasChpr_v2
+#define cublasZhpr cublasZhpr_v2
+
+#define cublasSsyr2 cublasSsyr2_v2
+#define cublasDsyr2 cublasDsyr2_v2
+#define cublasCsyr2 cublasCsyr2_v2
+#define cublasZsyr2 cublasZsyr2_v2
+#define cublasCher2 cublasCher2_v2
+#define cublasZher2 cublasZher2_v2
+
+#define cublasSspr2 cublasSspr2_v2
+#define cublasDspr2 cublasDspr2_v2
+#define cublasChpr2 cublasChpr2_v2
+#define cublasZhpr2 cublasZhpr2_v2
+
+/* Blas3 Routines   */
+
+#define cublasSgemm cublasSgemm_v2
+#define cublasDgemm cublasDgemm_v2
+#define cublasCgemm cublasCgemm_v2
+#define cublasZgemm cublasZgemm_v2
+
+#define cublasSsyrk cublasSsyrk_v2
+#define cublasDsyrk cublasDsyrk_v2
+#define cublasCsyrk cublasCsyrk_v2
+#define cublasZsyrk cublasZsyrk_v2
+#define cublasCherk cublasCherk_v2
+#define cublasZherk cublasZherk_v2
+
+#define cublasSsyr2k cublasSsyr2k_v2
+#define cublasDsyr2k cublasDsyr2k_v2
+#define cublasCsyr2k cublasCsyr2k_v2
+#define cublasZsyr2k cublasZsyr2k_v2
+#define cublasCher2k cublasCher2k_v2
+#define cublasZher2k cublasZher2k_v2
+
+#define cublasSsymm cublasSsymm_v2
+#define cublasDsymm cublasDsymm_v2
+#define cublasCsymm cublasCsymm_v2
+#define cublasZsymm cublasZsymm_v2
+#define cublasChemm cublasChemm_v2
+#define cublasZhemm cublasZhemm_v2
+
+#define cublasStrsm cublasStrsm_v2
+#define cublasDtrsm cublasDtrsm_v2
+#define cublasCtrsm cublasCtrsm_v2
+#define cublasZtrsm cublasZtrsm_v2
+
+#define cublasStrmm cublasStrmm_v2
+#define cublasDtrmm cublasDtrmm_v2
+#define cublasCtrmm cublasCtrmm_v2
+#define cublasZtrmm cublasZtrmm_v2
+
+/* 64-bit integer interface: generic _64 names are aliased to the _v2_64 entry points */
+
+/* Blas1 Routines   */
+
+#define cublasSnrm2_64 cublasSnrm2_v2_64
+#define cublasDnrm2_64 cublasDnrm2_v2_64
+#define cublasScnrm2_64 cublasScnrm2_v2_64
+#define cublasDznrm2_64 cublasDznrm2_v2_64
+
+#define cublasSdot_64 cublasSdot_v2_64
+#define cublasDdot_64 cublasDdot_v2_64
+#define cublasCdotu_64 cublasCdotu_v2_64
+#define cublasCdotc_64 cublasCdotc_v2_64
+#define cublasZdotu_64 cublasZdotu_v2_64
+#define cublasZdotc_64 cublasZdotc_v2_64
+
+#define cublasSscal_64 cublasSscal_v2_64
+#define cublasDscal_64 cublasDscal_v2_64
+#define cublasCscal_64 cublasCscal_v2_64
+#define cublasCsscal_64 cublasCsscal_v2_64
+#define cublasZscal_64 cublasZscal_v2_64
+#define cublasZdscal_64 cublasZdscal_v2_64
+
+#define cublasSaxpy_64 cublasSaxpy_v2_64
+#define cublasDaxpy_64 cublasDaxpy_v2_64
+#define cublasCaxpy_64 cublasCaxpy_v2_64
+#define cublasZaxpy_64 cublasZaxpy_v2_64
+
+#define cublasScopy_64 cublasScopy_v2_64
+#define cublasDcopy_64 cublasDcopy_v2_64
+#define cublasCcopy_64 cublasCcopy_v2_64
+#define cublasZcopy_64 cublasZcopy_v2_64
+
+#define cublasSswap_64 cublasSswap_v2_64
+#define cublasDswap_64 cublasDswap_v2_64
+#define cublasCswap_64 cublasCswap_v2_64
+#define cublasZswap_64 cublasZswap_v2_64
+
+#define cublasIsamax_64 cublasIsamax_v2_64
+#define cublasIdamax_64 cublasIdamax_v2_64
+#define cublasIcamax_64 cublasIcamax_v2_64
+#define cublasIzamax_64 cublasIzamax_v2_64
+
+#define cublasIsamin_64 cublasIsamin_v2_64
+#define cublasIdamin_64 cublasIdamin_v2_64
+#define cublasIcamin_64 cublasIcamin_v2_64
+#define cublasIzamin_64 cublasIzamin_v2_64
+
+#define cublasSasum_64 cublasSasum_v2_64
+#define cublasDasum_64 cublasDasum_v2_64
+#define cublasScasum_64 cublasScasum_v2_64
+#define cublasDzasum_64 cublasDzasum_v2_64
+
+#define cublasSrot_64 cublasSrot_v2_64
+#define cublasDrot_64 cublasDrot_v2_64
+#define cublasCrot_64 cublasCrot_v2_64
+#define cublasCsrot_64 cublasCsrot_v2_64
+#define cublasZrot_64 cublasZrot_v2_64
+#define cublasZdrot_64 cublasZdrot_v2_64
+
+#define cublasSrotg_64 cublasSrotg_v2_64
+#define cublasDrotg_64 cublasDrotg_v2_64
+#define cublasCrotg_64 cublasCrotg_v2_64
+#define cublasZrotg_64 cublasZrotg_v2_64
+
+#define cublasSrotm_64 cublasSrotm_v2_64
+#define cublasDrotm_64 cublasDrotm_v2_64
+
+#define cublasSrotmg_64 cublasSrotmg_v2_64
+#define cublasDrotmg_64 cublasDrotmg_v2_64
+
+/* Blas2 Routines */
+
+#define cublasSgemv_64 cublasSgemv_v2_64
+#define cublasDgemv_64 cublasDgemv_v2_64
+#define cublasCgemv_64 cublasCgemv_v2_64
+#define cublasZgemv_64 cublasZgemv_v2_64
+
+#define cublasSgbmv_64 cublasSgbmv_v2_64
+#define cublasDgbmv_64 cublasDgbmv_v2_64
+#define cublasCgbmv_64 cublasCgbmv_v2_64
+#define cublasZgbmv_64 cublasZgbmv_v2_64
+
+#define cublasStrmv_64 cublasStrmv_v2_64
+#define cublasDtrmv_64 cublasDtrmv_v2_64
+#define cublasCtrmv_64 cublasCtrmv_v2_64
+#define cublasZtrmv_64 cublasZtrmv_v2_64
+
+#define cublasStbmv_64 cublasStbmv_v2_64
+#define cublasDtbmv_64 cublasDtbmv_v2_64
+#define cublasCtbmv_64 cublasCtbmv_v2_64
+#define cublasZtbmv_64 cublasZtbmv_v2_64
+
+#define cublasStpmv_64 cublasStpmv_v2_64
+#define cublasDtpmv_64 cublasDtpmv_v2_64
+#define cublasCtpmv_64 cublasCtpmv_v2_64
+#define cublasZtpmv_64 cublasZtpmv_v2_64
+
+#define cublasStrsv_64 cublasStrsv_v2_64
+#define cublasDtrsv_64 cublasDtrsv_v2_64
+#define cublasCtrsv_64 cublasCtrsv_v2_64
+#define cublasZtrsv_64 cublasZtrsv_v2_64
+
+#define cublasStpsv_64 cublasStpsv_v2_64
+#define cublasDtpsv_64 cublasDtpsv_v2_64
+#define cublasCtpsv_64 cublasCtpsv_v2_64
+#define cublasZtpsv_64 cublasZtpsv_v2_64
+
+#define cublasStbsv_64 cublasStbsv_v2_64
+#define cublasDtbsv_64 cublasDtbsv_v2_64
+#define cublasCtbsv_64 cublasCtbsv_v2_64
+#define cublasZtbsv_64 cublasZtbsv_v2_64
+
+#define cublasSsymv_64 cublasSsymv_v2_64
+#define cublasDsymv_64 cublasDsymv_v2_64
+#define cublasCsymv_64 cublasCsymv_v2_64
+#define cublasZsymv_64 cublasZsymv_v2_64
+#define cublasChemv_64 cublasChemv_v2_64
+#define cublasZhemv_64 cublasZhemv_v2_64
+
+#define cublasSsbmv_64 cublasSsbmv_v2_64
+#define cublasDsbmv_64 cublasDsbmv_v2_64
+#define cublasChbmv_64 cublasChbmv_v2_64
+#define cublasZhbmv_64 cublasZhbmv_v2_64
+
+#define cublasSspmv_64 cublasSspmv_v2_64
+#define cublasDspmv_64 cublasDspmv_v2_64
+#define cublasChpmv_64 cublasChpmv_v2_64
+#define cublasZhpmv_64 cublasZhpmv_v2_64
+
+#define cublasSger_64 cublasSger_v2_64
+#define cublasDger_64 cublasDger_v2_64
+#define cublasCgeru_64 cublasCgeru_v2_64
+#define cublasCgerc_64 cublasCgerc_v2_64
+#define cublasZgeru_64 cublasZgeru_v2_64
+#define cublasZgerc_64 cublasZgerc_v2_64
+
+#define cublasSsyr_64 cublasSsyr_v2_64
+#define cublasDsyr_64 cublasDsyr_v2_64
+#define cublasCsyr_64 cublasCsyr_v2_64
+#define cublasZsyr_64 cublasZsyr_v2_64
+#define cublasCher_64 cublasCher_v2_64
+#define cublasZher_64 cublasZher_v2_64
+
+#define cublasSspr_64 cublasSspr_v2_64
+#define cublasDspr_64 cublasDspr_v2_64
+#define cublasChpr_64 cublasChpr_v2_64
+#define cublasZhpr_64 cublasZhpr_v2_64
+
+#define cublasSsyr2_64 cublasSsyr2_v2_64
+#define cublasDsyr2_64 cublasDsyr2_v2_64
+#define cublasCsyr2_64 cublasCsyr2_v2_64
+#define cublasZsyr2_64 cublasZsyr2_v2_64
+#define cublasCher2_64 cublasCher2_v2_64
+#define cublasZher2_64 cublasZher2_v2_64
+
+#define cublasSspr2_64 cublasSspr2_v2_64
+#define cublasDspr2_64 cublasDspr2_v2_64
+#define cublasChpr2_64 cublasChpr2_v2_64
+#define cublasZhpr2_64 cublasZhpr2_v2_64
+
+/* Blas3 Routines   */
+
+#define cublasSgemm_64 cublasSgemm_v2_64
+#define cublasDgemm_64 cublasDgemm_v2_64
+#define cublasCgemm_64 cublasCgemm_v2_64
+#define cublasZgemm_64 cublasZgemm_v2_64
+
+#define cublasSsyrk_64 cublasSsyrk_v2_64
+#define cublasDsyrk_64 cublasDsyrk_v2_64
+#define cublasCsyrk_64 cublasCsyrk_v2_64
+#define cublasZsyrk_64 cublasZsyrk_v2_64
+#define cublasCherk_64 cublasCherk_v2_64
+#define cublasZherk_64 cublasZherk_v2_64
+
+#define cublasSsyr2k_64 cublasSsyr2k_v2_64
+#define cublasDsyr2k_64 cublasDsyr2k_v2_64
+#define cublasCsyr2k_64 cublasCsyr2k_v2_64
+#define cublasZsyr2k_64 cublasZsyr2k_v2_64
+#define cublasCher2k_64 cublasCher2k_v2_64
+#define cublasZher2k_64 cublasZher2k_v2_64
+
+#define cublasSsymm_64 cublasSsymm_v2_64
+#define cublasDsymm_64 cublasDsymm_v2_64
+#define cublasCsymm_64 cublasCsymm_v2_64
+#define cublasZsymm_64 cublasZsymm_v2_64
+#define cublasChemm_64 cublasChemm_v2_64
+#define cublasZhemm_64 cublasZhemm_v2_64
+
+#define cublasStrsm_64 cublasStrsm_v2_64
+#define cublasDtrsm_64 cublasDtrsm_v2_64
+#define cublasCtrsm_64 cublasCtrsm_v2_64
+#define cublasZtrsm_64 cublasZtrsm_v2_64
+
+#define cublasStrmm_64 cublasStrmm_v2_64
+#define cublasDtrmm_64 cublasDtrmm_v2_64
+#define cublasCtrmm_64 cublasCtrmm_v2_64
+#define cublasZtrmm_64 cublasZtrmm_v2_64
+
+#endif /* !defined(CUBLAS_V2_H_) */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h
new file mode 100644
index 0000000000000000000000000000000000000000..29ea9153faf7b3e62a6d53c0be1980ae79c49f51
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h
@@ -0,0 +1,824 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(NVBLAS_H_)
+#define NVBLAS_H_
+
+#include "driver_types.h"
+#include "cuComplex.h" /* import complex data type */
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* GEMM: each routine is declared with and without a trailing underscore; all arguments are pointers */
+void sgemm_(const char* transa,
+            const char* transb,
+            const int* m,
+            const int* n,
+            const int* k,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            const float* b,
+            const int* ldb,
+            const float* beta,
+            float* c,
+            const int* ldc);
+
+void dgemm_(const char* transa,
+            const char* transb,
+            const int* m,
+            const int* n,
+            const int* k,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            const double* b,
+            const int* ldb,
+            const double* beta,
+            double* c,
+            const int* ldc);
+
+void cgemm_(const char* transa,
+            const char* transb,
+            const int* m,
+            const int* n,
+            const int* k,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* b,
+            const int* ldb,
+            const cuComplex* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zgemm_(const char* transa,
+            const char* transb,
+            const int* m,
+            const int* n,
+            const int* k,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* b,
+            const int* ldb,
+            const cuDoubleComplex* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+void sgemm(const char* transa,
+           const char* transb,
+           const int* m,
+           const int* n,
+           const int* k,
+           const float* alpha,
+           const float* a,
+           const int* lda,
+           const float* b,
+           const int* ldb,
+           const float* beta,
+           float* c,
+           const int* ldc);
+
+void dgemm(const char* transa,
+           const char* transb,
+           const int* m,
+           const int* n,
+           const int* k,
+           const double* alpha,
+           const double* a,
+           const int* lda,
+           const double* b,
+           const int* ldb,
+           const double* beta,
+           double* c,
+           const int* ldc);
+
+void cgemm(const char* transa,
+           const char* transb,
+           const int* m,
+           const int* n,
+           const int* k,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           const cuComplex* b,
+           const int* ldb,
+           const cuComplex* beta,
+           cuComplex* c,
+           const int* ldc);
+
+void zgemm(const char* transa,
+           const char* transb,
+           const int* m,
+           const int* n,
+           const int* k,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           const cuDoubleComplex* b,
+           const int* ldb,
+           const cuDoubleComplex* beta,
+           cuDoubleComplex* c,
+           const int* ldc);
+
+/* SYRK: declared with and without a trailing underscore; c is the only non-const argument */
+void ssyrk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            const float* beta,
+            float* c,
+            const int* ldc);
+
+void dsyrk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            const double* beta,
+            double* c,
+            const int* ldc);
+
+void csyrk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zsyrk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+void ssyrk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const float* alpha,
+           const float* a,
+           const int* lda,
+           const float* beta,
+           float* c,
+           const int* ldc);
+
+void dsyrk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const double* alpha,
+           const double* a,
+           const int* lda,
+           const double* beta,
+           double* c,
+           const int* ldc);
+
+void csyrk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           const cuComplex* beta,
+           cuComplex* c,
+           const int* ldc);
+
+void zsyrk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           const cuDoubleComplex* beta,
+           cuDoubleComplex* c,
+           const int* ldc);
+
+/* HERK: alpha and beta are real (float/double) scalars; a and c are complex */
+void cherk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const float* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const float* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zherk_(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const double* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const double* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+void cherk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const float* alpha,
+           const cuComplex* a,
+           const int* lda,
+           const float* beta,
+           cuComplex* c,
+           const int* ldc);
+
+void zherk(const char* uplo,
+           const char* trans,
+           const int* n,
+           const int* k,
+           const double* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           const double* beta,
+           cuDoubleComplex* c,
+           const int* ldc);
+
+/* TRSM: declared with and without a trailing underscore; b is the only non-const argument */
+void strsm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            float* b,
+            const int* ldb);
+
+void dtrsm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            double* b,
+            const int* ldb);
+
+void ctrsm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            cuComplex* b,
+            const int* ldb);
+
+void ztrsm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            cuDoubleComplex* b,
+            const int* ldb);
+
+void strsm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const float* alpha,
+           const float* a,
+           const int* lda,
+           float* b,
+           const int* ldb);
+
+void dtrsm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const double* alpha,
+           const double* a,
+           const int* lda,
+           double* b,
+           const int* ldb);
+
+void ctrsm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           cuComplex* b,
+           const int* ldb);
+
+void ztrsm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           cuDoubleComplex* b,
+           const int* ldb);
+
+/* SYMM */
+void ssymm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            const float* b,
+            const int* ldb,
+            const float* beta,
+            float* c,
+            const int* ldc);
+
+void dsymm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            const double* b,
+            const int* ldb,
+            const double* beta,
+            double* c,
+            const int* ldc);
+
+void csymm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* b,
+            const int* ldb,
+            const cuComplex* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zsymm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* b,
+            const int* ldb,
+            const cuDoubleComplex* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+void ssymm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const float* alpha,
+           const float* a,
+           const int* lda,
+           const float* b,
+           const int* ldb,
+           const float* beta,
+           float* c,
+           const int* ldc);
+
+void dsymm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const double* alpha,
+           const double* a,
+           const int* lda,
+           const double* b,
+           const int* ldb,
+           const double* beta,
+           double* c,
+           const int* ldc);
+
+void csymm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           const cuComplex* b,
+           const int* ldb,
+           const cuComplex* beta,
+           cuComplex* c,
+           const int* ldc);
+
+void zsymm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           const cuDoubleComplex* b,
+           const int* ldb,
+           const cuDoubleComplex* beta,
+           cuDoubleComplex* c,
+           const int* ldc);
+
+/* HEMM with trailing underscore */
+void chemm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* b,
+            const int* ldb,
+            const cuComplex* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zhemm_(const char* side,
+            const char* uplo,
+            const int* m,
+            const int* n,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* b,
+            const int* ldb,
+            const cuDoubleComplex* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+/* HEMM with no underscore */
+void chemm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           const cuComplex* b,
+           const int* ldb,
+           const cuComplex* beta,
+           cuComplex* c,
+           const int* ldc);
+
+void zhemm(const char* side,
+           const char* uplo,
+           const int* m,
+           const int* n,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           const cuDoubleComplex* b,
+           const int* ldb,
+           const cuDoubleComplex* beta,
+           cuDoubleComplex* c,
+           const int* ldc);
+
+/* SYR2K */
+void ssyr2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const float* alpha,
+             const float* a,
+             const int* lda,
+             const float* b,
+             const int* ldb,
+             const float* beta,
+             float* c,
+             const int* ldc);
+
+void dsyr2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const double* alpha,
+             const double* a,
+             const int* lda,
+             const double* b,
+             const int* ldb,
+             const double* beta,
+             double* c,
+             const int* ldc);
+
+void csyr2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const cuComplex* alpha,
+             const cuComplex* a,
+             const int* lda,
+             const cuComplex* b,
+             const int* ldb,
+             const cuComplex* beta,
+             cuComplex* c,
+             const int* ldc);
+
+void zsyr2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const cuDoubleComplex* alpha,
+             const cuDoubleComplex* a,
+             const int* lda,
+             const cuDoubleComplex* b,
+             const int* ldb,
+             const cuDoubleComplex* beta,
+             cuDoubleComplex* c,
+             const int* ldc);
+
+/* SYR2K with no underscore */
+void ssyr2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            const float* b,
+            const int* ldb,
+            const float* beta,
+            float* c,
+            const int* ldc);
+
+void dsyr2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            const double* b,
+            const int* ldb,
+            const double* beta,
+            double* c,
+            const int* ldc);
+
+void csyr2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* b,
+            const int* ldb,
+            const cuComplex* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zsyr2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* b,
+            const int* ldb,
+            const cuDoubleComplex* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+/* HER2K */
+void cher2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const cuComplex* alpha,
+             const cuComplex* a,
+             const int* lda,
+             const cuComplex* b,
+             const int* ldb,
+             const float* beta,
+             cuComplex* c,
+             const int* ldc);
+
+void zher2k_(const char* uplo,
+             const char* trans,
+             const int* n,
+             const int* k,
+             const cuDoubleComplex* alpha,
+             const cuDoubleComplex* a,
+             const int* lda,
+             const cuDoubleComplex* b,
+             const int* ldb,
+             const double* beta,
+             cuDoubleComplex* c,
+             const int* ldc);
+
+/* HER2K with no underscore */
+void cher2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            const cuComplex* b,
+            const int* ldb,
+            const float* beta,
+            cuComplex* c,
+            const int* ldc);
+
+void zher2k(const char* uplo,
+            const char* trans,
+            const int* n,
+            const int* k,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            const cuDoubleComplex* b,
+            const int* ldb,
+            const double* beta,
+            cuDoubleComplex* c,
+            const int* ldc);
+
+/* TRMM */
+void strmm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const float* alpha,
+            const float* a,
+            const int* lda,
+            float* b,
+            const int* ldb);
+
+void dtrmm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const double* alpha,
+            const double* a,
+            const int* lda,
+            double* b,
+            const int* ldb);
+
+void ctrmm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const cuComplex* alpha,
+            const cuComplex* a,
+            const int* lda,
+            cuComplex* b,
+            const int* ldb);
+
+void ztrmm_(const char* side,
+            const char* uplo,
+            const char* transa,
+            const char* diag,
+            const int* m,
+            const int* n,
+            const cuDoubleComplex* alpha,
+            const cuDoubleComplex* a,
+            const int* lda,
+            cuDoubleComplex* b,
+            const int* ldb);
+
+void strmm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const float* alpha,
+           const float* a,
+           const int* lda,
+           float* b,
+           const int* ldb);
+
+void dtrmm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const double* alpha,
+           const double* a,
+           const int* lda,
+           double* b,
+           const int* ldb);
+
+void ctrmm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const cuComplex* alpha,
+           const cuComplex* a,
+           const int* lda,
+           cuComplex* b,
+           const int* ldb);
+
+void ztrmm(const char* side,
+           const char* uplo,
+           const char* transa,
+           const char* diag,
+           const int* m,
+           const int* n,
+           const cuDoubleComplex* alpha,
+           const cuDoubleComplex* a,
+           const int* lda,
+           cuDoubleComplex* b,
+           const int* ldb);
+
+#if defined(__cplusplus)
+}
+#endif /* __cplusplus */
+
+#endif /* !defined(NVBLAS_H_) */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c45d0aa00d2061228f50bd66e0f0769a9e8a5ef6
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.12 b/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.12
new file mode 100644
index 0000000000000000000000000000000000000000..7dc8c88191367bee01e752de108445a72c669208
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.12
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c2a58dc54154208392301d0fe3d53a120e4c1ebeab9e80ce91fe9948baeadc9
+size 757496
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..62ddebfa6c7a0b463facf5b53350f2a6554ed333
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd2af756dec150094c29e9a777dbe4fd43d31c85
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd9dbf51efe8d0803ce470bbb2b90ab21ad993d5
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h
@@ -0,0 +1,869 @@
+//
+// NVIDIA_COPYRIGHT_BEGIN
+//
+// Copyright (c) 2014-2023, NVIDIA CORPORATION.  All rights reserved.
+//
+// NVIDIA CORPORATION and its licensors retain all intellectual property
+// and proprietary rights in and to this software, related documentation
+// and any modifications thereto.  Any use, reproduction, disclosure or
+// distribution of this software and related documentation without an express
+// license agreement from NVIDIA CORPORATION is strictly prohibited.
+//
+// NVIDIA_COPYRIGHT_END
+//
+
+#ifndef __NVRTC_H__
+#define __NVRTC_H__
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#include <stdlib.h>
+
+
+/*************************************************************************//**
+ *
+ * \defgroup error Error Handling
+ *
+ * NVRTC defines the following enumeration type and function for API call
+ * error handling.
+ *
+ ****************************************************************************/
+
+
+/**
+ * \ingroup error
+ * \brief   The enumerated type nvrtcResult defines API call result codes.
+ *          NVRTC API functions return nvrtcResult to indicate the call
+ *          result.
+ */
+typedef enum {
+  NVRTC_SUCCESS = 0,
+  NVRTC_ERROR_OUT_OF_MEMORY = 1,
+  NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
+  NVRTC_ERROR_INVALID_INPUT = 3,
+  NVRTC_ERROR_INVALID_PROGRAM = 4,
+  NVRTC_ERROR_INVALID_OPTION = 5,
+  NVRTC_ERROR_COMPILATION = 6,
+  NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
+  NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
+  NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
+  NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
+  NVRTC_ERROR_INTERNAL_ERROR = 11,
+  NVRTC_ERROR_TIME_FILE_WRITE_FAILED = 12
+} nvrtcResult;
+
+
+/**
+ * \ingroup error
+ * \brief   nvrtcGetErrorString is a helper function that returns a string
+ *          describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
+ *          \c "NVRTC_SUCCESS".
+ *          For unrecognized enumeration values, it returns
+ *          \c "NVRTC_ERROR unknown".
+ *
+ * \param   [in] result CUDA Runtime Compilation API result code.
+ * \return  Message string for the given #nvrtcResult code.
+ */
+const char *nvrtcGetErrorString(nvrtcResult result);
+
+
+/*************************************************************************//**
+ *
+ * \defgroup query General Information Query
+ *
+ * NVRTC defines the following function for general information query.
+ *
+ ****************************************************************************/
+
+
+/**
+ * \ingroup query
+ * \brief   nvrtcVersion sets the output parameters \p major and \p minor
+ *          with the CUDA Runtime Compilation version number.
+ *
+ * \param   [out] major CUDA Runtime Compilation major version number.
+ * \param   [out] minor CUDA Runtime Compilation minor version number.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *
+ */
+nvrtcResult nvrtcVersion(int *major, int *minor);
+
+
+/**
+ * \ingroup query
+ * \brief   nvrtcGetNumSupportedArchs sets the output parameter \p numArchs 
+ *          with the number of architectures supported by NVRTC. This can 
+ *          then be used to pass an array to ::nvrtcGetSupportedArchs to
+ *          get the supported architectures.
+ *
+ * \param   [out] numArchs number of supported architectures.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *
+ * \see     ::nvrtcGetSupportedArchs
+ */
+nvrtcResult nvrtcGetNumSupportedArchs(int* numArchs);
+
+
+/**
+ * \ingroup query
+ * \brief   nvrtcGetSupportedArchs populates the array passed via the output parameter 
+ *          \p supportedArchs with the architectures supported by NVRTC. The array is
+ *          sorted in the ascending order. The size of the array to be passed can be
+ *          determined using ::nvrtcGetNumSupportedArchs.
+ *
+ * \param   [out] supportedArchs sorted array of supported architectures.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *
+ * \see     ::nvrtcGetNumSupportedArchs
+ */
+nvrtcResult nvrtcGetSupportedArchs(int* supportedArchs);
+
+
+/*************************************************************************//**
+ *
+ * \defgroup compilation Compilation
+ *
+ * NVRTC defines the following type and functions for actual compilation.
+ *
+ ****************************************************************************/
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcProgram is the unit of compilation, and an opaque handle for
+ *          a program.
+ *
+ * To compile a CUDA program string, an instance of nvrtcProgram must be
+ * created first with ::nvrtcCreateProgram, then compiled with
+ * ::nvrtcCompileProgram.
+ */
+typedef struct _nvrtcProgram *nvrtcProgram;
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcCreateProgram creates an instance of nvrtcProgram with the
+ *          given input parameters, and sets the output parameter \p prog with
+ *          it.
+ *
+ * \param   [out] prog         CUDA Runtime Compilation program.
+ * \param   [in]  src          CUDA program source.
+ * \param   [in]  name         CUDA program name.\n
+ *                             \p name can be \c NULL; \c "default_program" is
+ *                             used when \p name is \c NULL or "".
+ * \param   [in]  numHeaders   Number of headers used.\n
+ *                             \p numHeaders must be greater than or equal to 0.
+ * \param   [in]  headers      Sources of the headers.\n
+ *                             \p headers can be \c NULL when \p numHeaders is
+ *                             0.
+ * \param   [in]  includeNames Name of each header by which they can be
+ *                             included in the CUDA program source.\n
+ *                             \p includeNames can be \c NULL when \p numHeaders
+ *                             is 0. These headers must be included with the exact
+ *                             names specified here.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcDestroyProgram
+ */
+nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
+                               const char *src,
+                               const char *name,
+                               int numHeaders,
+                               const char * const *headers,
+                               const char * const *includeNames);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcDestroyProgram destroys the given program.
+ *
+ * \param    [in] prog CUDA Runtime Compilation program.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcCreateProgram
+ */
+nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcCompileProgram compiles the given program.
+ *
+ * \param   [in] prog       CUDA Runtime Compilation program.
+ * \param   [in] numOptions Number of compiler options passed.
+ * \param   [in] options    Compiler options in the form of C string array.\n
+ *                          \p options can be \c NULL when \p numOptions is 0.
+ *
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_TIME_FILE_WRITE_FAILED \endlink
+ *
+ * It supports compile options listed in \ref options.
+ */
+nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
+                                int numOptions, const char * const *options);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetPTXSize sets the value of \p ptxSizeRet with the size of the PTX
+ *          generated by the previous compilation of \p prog (including the
+ *          trailing \c NULL).
+ *
+ * \param   [in]  prog       CUDA Runtime Compilation program.
+ * \param   [out] ptxSizeRet Size of the generated PTX (including the trailing
+ *                           \c NULL).
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetPTX
+ */
+nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetPTX stores the PTX generated by the previous compilation
+ *          of \p prog in the memory pointed by \p ptx.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [out] ptx  Compiled result.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetPTXSize
+ */
+nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetCUBINSize sets the value of \p cubinSizeRet with the size of the cubin
+ *          generated by the previous compilation of \p prog. The value of
+ *          cubinSizeRet is set to 0 if the value specified to \c -arch is a
+ *          virtual architecture instead of an actual architecture.
+ *
+ * \param   [in]  prog       CUDA Runtime Compilation program.
+ * \param   [out] cubinSizeRet Size of the generated cubin.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetCUBIN
+ */
+nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetCUBIN stores the cubin generated by the previous compilation
+ *          of \p prog in the memory pointed by \p cubin. No cubin is available
+ *          if the value specified to \c -arch is a virtual architecture instead
+ *          of an actual architecture.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [out] cubin  Compiled and assembled result.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetCUBINSize
+ */
+nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
+
+
+#if defined(_WIN32)
+# define __DEPRECATED__(msg) __declspec(deprecated(msg))
+#elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
+# define __DEPRECATED__(msg) __attribute__((deprecated))
+#elif (defined(__GNUC__))
+# define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
+#else
+# define __DEPRECATED__(msg)
+#endif
+
+/**
+ * \ingroup compilation
+ * \brief   
+ * DEPRECATION NOTICE: This function will be removed in a future release. Please use
+ * nvrtcGetLTOIRSize (and nvrtcGetLTOIR) instead.
+ */
+__DEPRECATED__("This function will be removed in a future release. Please use nvrtcGetLTOIRSize instead")
+nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet);
+
+/**
+ * \ingroup compilation
+ * \brief   
+ * DEPRECATION NOTICE: This function will be removed in a future release. Please use
+ * nvrtcGetLTOIR (and nvrtcGetLTOIRSize) instead.
+ */
+__DEPRECATED__("This function will be removed in a future release. Please use nvrtcGetLTOIR instead")
+nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm);
+
+#undef __DEPRECATED__
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetLTOIRSize sets the value of \p LTOIRSizeRet with the size of the LTO IR
+ *          generated by the previous compilation of \p prog. The value of
+ *          LTOIRSizeRet is set to 0 if the program was not compiled with 
+ *          \c -dlto.
+ *
+ * \param   [in]  prog       CUDA Runtime Compilation program.
+ * \param   [out] LTOIRSizeRet Size of the generated LTO IR.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetLTOIR
+ */
+nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *LTOIRSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetLTOIR stores the LTO IR generated by the previous compilation
+ *          of \p prog in the memory pointed by \p LTOIR. No LTO IR is available
+ *          if the program was compiled without \c -dlto.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [out] LTOIR Compiled result.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetLTOIRSize
+ */
+nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *LTOIR);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetOptiXIRSize sets the value of \p optixirSizeRet with the size of the OptiX IR
+ *          generated by the previous compilation of \p prog. The value of
+ *          \p optixirSizeRet is set to 0 if the program was compiled with
+ *          options incompatible with OptiX IR generation.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [out] optixirSizeRet Size of the generated OptiX IR.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetOptiXIR
+ */
+nvrtcResult nvrtcGetOptiXIRSize(nvrtcProgram prog, size_t *optixirSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetOptiXIR stores the OptiX IR generated by the previous compilation
+ *          of \p prog in the memory pointed by \p optixir. No OptiX IR is available
+ *          if the program was compiled with options incompatible with OptiX IR generation.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [out] optixir Compiled result.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetOptiXIRSize
+ */
+nvrtcResult nvrtcGetOptiXIR(nvrtcProgram prog, char *optixir);
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
+ *          log generated by the previous compilation of \p prog (including the
+ *          trailing \c NULL).
+ *
+ * Note that compilation log may be generated with warnings and informative
+ * messages, even when the compilation of \p prog succeeds.
+ *
+ * \param   [in]  prog       CUDA Runtime Compilation program.
+ * \param   [out] logSizeRet Size of the compilation log
+ *                           (including the trailing \c NULL).
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetProgramLog
+ */
+nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetProgramLog stores the log generated by the previous
+ *          compilation of \p prog in the memory pointed by \p log.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [out] log  Compilation log.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *
+ * \see     ::nvrtcGetProgramLogSize
+ */
+nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
+
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcAddNameExpression notes the given name expression
+ *          denoting the address of a __global__ function 
+ *          or __device__/__constant__ variable.
+ *
+ * The identical name expression string must be provided on a subsequent
+ * call to nvrtcGetLoweredName to extract the lowered name.
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [in] name_expression constant expression denoting the address of
+ *               a __global__ function or __device__/__constant__ variable.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
+ *
+ * \see     ::nvrtcGetLoweredName
+ */
+nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
+                                   const char * const name_expression);
+
+/**
+ * \ingroup compilation
+ * \brief   nvrtcGetLoweredName extracts the lowered (mangled) name
+ *          for a __global__ function or __device__/__constant__ variable,
+ *          and updates *lowered_name to point to it. The memory containing
+ *          the name is released when the NVRTC program is destroyed by 
+ *          nvrtcDestroyProgram.
+ *          The identical name expression must have been previously
+ *          provided to nvrtcAddNameExpression.
+ *
+ * \param   [in]  prog CUDA Runtime Compilation program.
+ * \param   [in] name_expression constant expression denoting the address of 
+ *               a __global__ function or __device__/__constant__ variable.
+ * \param   [out] lowered_name initialized by the function to point to a
+ *               C string containing the lowered (mangled)
+ *               name corresponding to the provided name expression.
+ * \return
+ *   - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
+ *   - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
+ *
+ * \see     ::nvrtcAddNameExpression
+ */
+nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
+                                const char *const name_expression,
+                                const char** lowered_name);
+
+
+/**
+ * \defgroup options Supported Compile Options
+ *
+ * NVRTC supports the compile options below.
+ * Option names with two preceding dashes (\c --) are long option names and
+ * option names with one preceding dash (\c -) are short option names.
+ * Short option names can be used instead of long option names.
+ * When a compile option takes an argument, an assignment operator (\c =)
+ * is used to separate the compile option argument from the compile option
+ * name, e.g., \c "--gpu-architecture=compute_60".
+ * Alternatively, the compile option name and the argument can be specified in
+ * separate strings without an assignment operator, e.g.,
+ * \c "--gpu-architecture" \c "compute_60".
+ * Single-character short option names, such as \c -D, \c -U, and \c -I, do
+ * not require an assignment operator, and the compile option name and the
+ * argument can be present in the same string with or without spaces between
+ * them.
+ * For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
+ * supported.
+ *
+ * The valid compiler options are:
+ *
+ *   - Compilation targets
+ *     - \c --gpu-architecture=\<arch\> (\c -arch)\n
+ *       Specify the name of the class of GPU architectures for which the
+ *       input must be compiled.\n
+ *       - Valid <c>\<arch\></c>s:
+ *         - \c compute_50
+ *         - \c compute_52
+ *         - \c compute_53
+ *         - \c compute_60
+ *         - \c compute_61
+ *         - \c compute_62
+ *         - \c compute_70
+ *         - \c compute_72
+ *         - \c compute_75
+ *         - \c compute_80
+ *         - \c compute_87
+ *         - \c compute_89
+ *         - \c compute_90
+ *         - \c compute_90a
+ *         - \c sm_50
+ *         - \c sm_52
+ *         - \c sm_53
+ *         - \c sm_60
+ *         - \c sm_61
+ *         - \c sm_62
+ *         - \c sm_70
+ *         - \c sm_72
+ *         - \c sm_75
+ *         - \c sm_80
+ *         - \c sm_87
+ *         - \c sm_89
+ *         - \c sm_90
+ *         - \c sm_90a
+ *       - Default: \c compute_52
+ *   - Separate compilation / whole-program compilation
+ *     - \c --device-c (\c -dc)\n
+ *       Generate relocatable code that can be linked with other relocatable
+ *       device code.  It is equivalent to --relocatable-device-code=true.
+ *     - \c --device-w (\c -dw)\n
+ *       Generate non-relocatable code.  It is equivalent to
+ *       \c --relocatable-device-code=false.
+ *     - \c --relocatable-device-code={true|false} (\c -rdc)\n
+ *       Enable (disable) the generation of relocatable device code.
+ *       - Default: \c false
+ *     - \c --extensible-whole-program (\c -ewp)\n
+ *       Do extensible whole program compilation of device code.
+ *       - Default: \c false
+ *   - Debugging support
+ *     - \c --device-debug (\c -G)\n
+ *       Generate debug information. If --dopt is not specified, 
+ *       then turns off all optimizations.
+ *     - \c --generate-line-info (\c -lineinfo)\n
+ *       Generate line-number information.
+ *   - Code generation
+ *     - \c --dopt on (\c -dopt)\n
+ *     - \c --dopt=on \n
+ *       Enable device code optimization. When specified along with '-G', enables
+ *       limited debug information generation for optimized device code (currently,
+ *       only line number information).
+ *       When '-G' is not specified, '-dopt=on' is implicit.
+ *     - \c --ptxas-options \<options\> (\c -Xptxas)\n
+ *     - \c --ptxas-options=\<options\> \n
+ *       Specify options directly to ptxas, the PTX optimizing assembler.
+ *     - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
+ *       Specify the maximum amount of registers that GPU functions can use.
+ *       Until a function-specific limit, a higher value will generally
+ *       increase the performance of individual GPU threads that execute this
+ *       function.  However, because thread registers are allocated from a
+ *       global register pool on each GPU, a higher value of this option will
+ *       also reduce the maximum thread block size, thereby reducing the amount
+ *       of thread parallelism.  Hence, a good maxrregcount value is the result
+ *       of a trade-off.  If this option is not specified, then no maximum is
+ *       assumed.  Value less than the minimum registers required by ABI will
+ *       be bumped up by the compiler to ABI minimum limit.
+ *     - \c --ftz={true|false} (\c -ftz)\n
+ *       When performing single-precision floating-point operations, flush
+ *       denormal values to zero or preserve denormal values.
+ *       \c --use_fast_math implies \c --ftz=true.
+ *       - Default: \c false
+ *     - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
+ *       For single-precision floating-point square root, use IEEE
+ *       round-to-nearest mode or use a faster approximation.
+ *       \c --use_fast_math implies \c --prec-sqrt=false.
+ *       - Default: \c true
+ *     - \c --prec-div={true|false} (\c -prec-div)\n
+ *       For single-precision floating-point division and reciprocals, use IEEE
+ *       round-to-nearest mode or use a faster approximation.
+ *       \c --use_fast_math implies \c --prec-div=false.
+ *       - Default: \c true
+ *     - \c --fmad={true|false} (\c -fmad)\n
+ *       Enables (disables) the contraction of floating-point multiplies and
+ *       adds/subtracts into floating-point multiply-add operations (FMAD,
+ *       FFMA, or DFMA).  \c --use_fast_math implies \c --fmad=true.
+ *       - Default: \c true
+ *     - \c --use_fast_math (\c -use_fast_math)\n
+ *       Make use of fast math operations.
+ *       \c --use_fast_math implies \c --ftz=true \c --prec-div=false
+ *       \c --prec-sqrt=false \c --fmad=true.
+ *     - \c --extra-device-vectorization (\c -extra-device-vectorization)\n
+ *       Enables more aggressive device code vectorization in the NVVM optimizer.
+ *     - \c --modify-stack-limit={true|false} (\c -modify-stack-limit)\n
+ *       On Linux, during compilation, use \c setrlimit() to increase stack size 
+ *       to maximum allowed. The limit is reset to the previous value at the
+ *       end of compilation.
+ *       Note: \c setrlimit() changes the value for the entire process.
+ *       - Default: \c true
+ *     - \c --dlink-time-opt (\c -dlto)\n
+ *       Generate intermediate code for later link-time optimization.
+ *       It implies \c -rdc=true. 
+ *       Note: when this option is used the nvrtcGetLTOIR API should be used, 
+ *       as PTX or Cubin will not be generated.
+ *     - \c --gen-opt-lto (\c -gen-opt-lto)\n
+ *       Run the optimizer passes before generating the LTO IR.
+ *     - \c --optix-ir (\c -optix-ir)\n
+ *       Generate OptiX IR. The Optix IR is only intended for consumption by OptiX
+ *       through appropriate APIs. This feature is not supported with 
+ *       link-time-optimization (\c -dlto).\n
+ *       Note: when this option is used the nvrtcGetOptiXIR API should be used, 
+ *       as PTX or Cubin will not be generated.
+ *     - \c --jump-table-density=[0-101] (\c -jtd)\n
+ *       Specify the case density percentage in switch statements, and use it as
+ *       a minimal threshold to determine whether jump table(brx.idx instruction)
+ *       will be used to implement a switch statement. Default value is 101. The
+ *       percentage ranges from 0 to 101 inclusively.
+ *   - Preprocessing
+ *     - \c --define-macro=\<def\> (\c -D)\n
+ *       \c \<def\> can be either \c \<name\> or \c \<name\>=\<definition\>.
+ *       - \c \<name\> \n
+ *         Predefine \c \<name\> as a macro with definition \c 1.
+ *       - \c \<name\>=\<definition\> \n
+ *         The contents of \c \<definition\> are tokenized and preprocessed
+ *         as if they appeared during translation phase three in a \c \#define
+ *         directive.  In particular, the definition will be truncated by
+ *         embedded new line characters.
+ *     - \c --undefine-macro=\<def\> (\c -U)\n
+ *       Cancel any previous definition of \c \<def\>.
+ *     - \c --include-path=\<dir\> (\c -I)\n
+ *       Add the directory \c \<dir\> to the list of directories to be
+ *       searched for headers.  These paths are searched after the list of
+ *       headers given to ::nvrtcCreateProgram.
+ *     - \c --pre-include=\<header\> (\c -include)\n
+ *       Preinclude \c \<header\> during preprocessing.
+ *     - \c --no-source-include (\c -no-source-include)\n
+ *       The preprocessor by default adds the directory of each input source
+ *       to the include path. This option disables this feature and only
+ *       considers the path specified explicitly.
+ *   - Language Dialect
+ *     - \c --std={c++03|c++11|c++14|c++17|c++20}
+ *       (\c -std={c++03|c++11|c++14|c++17|c++20})\n
+ *       Set language dialect to C++03, C++11, C++14, C++17 or C++20
+ *       - Default: \c c++17
+ *     - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
+ *       Provide builtin definitions of \c std::move and \c std::forward,
+ *       when C++11 or later language dialect is selected.
+ *       - Default: \c true
+ *     - \c --builtin-initializer-list={true|false}
+ *       (\c -builtin-initializer-list)\n
+ *       Provide builtin definitions of \c std::initializer_list class and
+ *       member functions when C++11 or later language dialect is selected.
+ *       - Default: \c true
+ *   - Misc.
+ *     - \c --disable-warnings (\c -w)\n
+ *       Inhibit all warning messages.
+ *     - \c --restrict (\c -restrict)\n
+ *       Programmer assertion that all kernel pointer parameters are restrict
+ *       pointers.
+ *     - \c --device-as-default-execution-space
+ *       (\c -default-device)\n
+ *       Treat entities with no execution space annotation as \c __device__
+ *       entities.
+ *     - \c --device-int128 (\c -device-int128)\n
+ *       Allow the \c __int128 type in device code. Also causes the macro \c __CUDACC_RTC_INT128__
+ *       to be defined.
+ *     - \c --optimization-info=\<kind\> (\c -opt-info)\n
+ *       Provide optimization reports for the specified kind of optimization.
+ *       The following kind tags are supported:
+ *         - \c inline : emit a remark when a function is inlined.
+ *     - \c --display-error-number (\c -err-no)\n
+ *       Display diagnostic number for warning messages. (Default)
+ *     - \c --no-display-error-number (\c -no-err-no)\n
+ *       Disables the display of a diagnostic number for warning messages.
+ *     - \c --diag-error=<error-number>,... (\c -diag-error)\n
+ *       Emit error for specified diagnostic message number(s). Message numbers can be separated by comma.
+ *     - \c --diag-suppress=<error-number>,... (\c -diag-suppress)\n
+ *       Suppress specified diagnostic message number(s). Message numbers can be separated by comma.
+ *     - \c --diag-warn=<error-number>,... (\c -diag-warn)\n
+ *       Emit warning for specified diagnostic message number(s). Message numbers can be separated by comma.
+ *     - \c --brief-diagnostics={true|false}  (\c -brief-diag)\n
+ *       This option disables or enables showing source line and column info 
+ *       in a diagnostic.
+ *       The --brief-diagnostics=true will not show the source line and column info.
+ *       - Default: \c false
+ *     - \c --time=<file-name> (\c -time)\n
+ *       Generate a comma-separated value table with the time taken by each compilation
+ *       phase, and append it at the end of the file given as the option argument.
+ *       If the file does not exist, the column headings are generated in the first row
+ *       of the table. If the file name is '-', the timing data is written to the compilation log.
+ *     - \c --split-compile=<number of threads> (\c -split-compile=<number of threads>)\n
+ *       Perform compiler optimizations in parallel.
+ *       Split compilation attempts to reduce compile time by enabling the compiler to run certain
+ *       optimization passes concurrently. This option accepts a numerical value that specifies the
+ *       maximum number of threads the compiler can use. One can also allow the compiler to use the maximum
+ *       threads available on the system by setting --split-compile=0.
+ *       Setting --split-compile=1 will cause this option to be ignored.
+ *     - \c --fdevice-syntax-only (\c -fdevice-syntax-only)\n
+ *       Ends device compilation after front-end syntax checking. This option does not generate valid
+ *       device code.
+ *     - \c --minimal  (\c -minimal)\n
+ *        Omit certain language features to reduce compile time for small programs. 
+ *        In particular, the following are omitted: 
+ *            - Texture and surface functions and associated types, e.g., \c cudaTextureObject_t.
+ *            - CUDA Runtime Functions that are provided by the cudadevrt device code library, 
+ *              typically named with prefix "cuda", e.g., \c cudaMalloc.
+ *            - Kernel launch from device code.
+ *            - Types and macros associated with CUDA Runtime and Driver APIs, 
+ *              provided by cuda/tools/cudart/driver_types.h, typically named with prefix "cuda", e.g., \c cudaError_t.
+ *
+ */
+
+#ifdef __cplusplus
+}
+#endif /* __cplusplus */
+
+
+/* The utility function 'nvrtcGetTypeName' is not available by default. Define
+   the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
+*/
+   
+#if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
+
+#if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
+#include <cxxabi.h>
+#include <cstdlib>
+
+#elif defined(_WIN32)
+#include <Windows.h>
+#include <DbgHelp.h>
+#endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
+
+
+#include <string>
+#include <typeinfo>
+
+/* Empty tag template. nvrtcGetTypeName<T>() takes typeid() of this wrapper
+   instantiated with T and then strips the wrapper from the demangled name. */
+template <typename T> struct __nvrtcGetTypeName_helper_t { };
+
+/*************************************************************************//**
+ *
+ * \defgroup hosthelper Host Helper
+ *
+ * NVRTC defines the following functions for easier interaction with host code.
+ *
+ ****************************************************************************/
+
+/**
+ * \ingroup hosthelper
+ * \brief   nvrtcGetTypeName stores the source level name of a type in the given
+ *          std::string location.
+ *
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
+ * respectively. If the name extraction fails, it will return
+ * NVRTC_ERROR_INTERNAL_ERROR and leave *result unmodified, otherwise *result is
+ * assigned the extracted name.
+ *
+ * The abi::__cxa_demangle path is selected by the same condition that pulls in
+ * <cxxabi.h> earlier in this header (NVRTC_USE_CXXABI, or a clang/GCC compiler).
+ * The legacy spelling USE_CXXABI is still honoured for existing builds.
+ *
+ * Windows-specific notes:
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
+ *   which is not multi-thread safe.
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
+ *
+ * \param   [in] tinfo: reference to object of type std::type_info for a given type.
+ * \param   [out] result: pointer to std::string in which to store the type name.
+ * \return
+ *  - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *  - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
+ *
+ */
+inline nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result)
+{
+#if NVRTC_USE_CXXABI || USE_CXXABI || __clang__ || __GNUC__
+  const char *name = tinfo.name();
+  /* Start from a failure value so a demangler that bails out early is never
+     mistaken for success. */
+  int status = -1;
+  char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
+  if (status == 0 && undecorated_name) {
+    *result = undecorated_name;
+    /* __cxa_demangle hands back a malloc'ed buffer that the caller owns. */
+    free(undecorated_name);
+    return NVRTC_SUCCESS;
+  }
+#elif defined(_WIN32)
+  const char *name = tinfo.raw_name();
+  if (!name || *name != '.') {
+    return NVRTC_ERROR_INTERNAL_ERROR;
+  }
+  char undecorated_name[4096];
+  //name+1 skips over the '.' prefix
+  if(UnDecorateSymbolName(name+1, undecorated_name,
+                          sizeof(undecorated_name) / sizeof(*undecorated_name),
+                           //note: doesn't seem to work correctly without UNDNAME_NO_ARGUMENTS.
+                           UNDNAME_NO_ARGUMENTS | UNDNAME_NAME_ONLY ) ) {
+    *result = undecorated_name;
+    return NVRTC_SUCCESS;
+  }
+#endif  /* NVRTC_USE_CXXABI || USE_CXXABI || __clang__ || __GNUC__ */
+
+  return NVRTC_ERROR_INTERNAL_ERROR;
+}
+
+/**
+ * \ingroup hosthelper
+ * \brief   nvrtcGetTypeName stores the source level name of the template type argument
+ *          T in the given std::string location.
+ *
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
+ * defined with a non-zero value. The name is obtained by demangling the type
+ * __nvrtcGetTypeName_helper_t<T> through the std::type_info overload of
+ * nvrtcGetTypeName (abi::__cxa_demangle with gcc/clang, UnDecorateSymbolName
+ * with cl.exe) and keeping only the text between the '<' that follows the
+ * helper's name and the last '>' of the demangled string.
+ * If the name extraction fails, NVRTC_ERROR_INTERNAL_ERROR is returned,
+ * otherwise *result holds the extracted name.
+ *
+ * Windows-specific notes:
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
+ *   which is not multi-thread safe.
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
+ *
+ * \param   [out] result: pointer to std::string in which to store the type name.
+ * \return
+ *  - \link #nvrtcResult NVRTC_SUCCESS \endlink
+ *  - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
+ *
+ */
+
+template <typename T>
+nvrtcResult nvrtcGetTypeName(std::string *result)
+{
+  const nvrtcResult status =
+      nvrtcGetTypeName(typeid(__nvrtcGetTypeName_helper_t<T>), result);
+  if (status != NVRTC_SUCCESS) {
+    return status;
+  }
+
+  /* Work on a copy: *result is overwritten only once both delimiters are known. */
+  const std::string wrapped(*result);
+
+  const std::size_t helper_pos = wrapped.find("__nvrtcGetTypeName_helper_t");
+  if (helper_pos == std::string::npos) {
+    return NVRTC_ERROR_INTERNAL_ERROR;
+  }
+
+  const std::size_t open_pos = wrapped.find('<', helper_pos);
+  const std::size_t close_pos = wrapped.rfind('>');
+  if (open_pos == std::string::npos || close_pos == std::string::npos) {
+    return NVRTC_ERROR_INTERNAL_ERROR;
+  }
+
+  /* Keep what lies strictly between the helper's '<' and the final '>'. */
+  const std::size_t first = open_pos + 1;
+  *result = wrapped.substr(first, close_pos - first);
+  return NVRTC_SUCCESS;
+}
+
+#endif  /* NVRTC_GET_TYPE_NAME */
+
+#endif /* __NVRTC_H__ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e04729f15a3b8ddf20f7a48ab8f77c6581f032e7
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28bd84c3870eef18edb05fb58247a276b91d6b04
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/async.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/async.h
new file mode 100644
index 0000000000000000000000000000000000000000..1b7dcb2433f2cb7d1ef61290995ac871a901b1e8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/async.h
@@ -0,0 +1,452 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * The source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * The Licensed Deliverables contained herein are PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CG_ASYNC_H
+#define _CG_ASYNC_H
+
+#include "helpers.h"
+#include "info.h"
+
+#include <cuda_pipeline.h>
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+// Groups supported by memcpy_async
+// Primary template: any group type not specialized below is reported as unsupported.
+template <class TyGroup>
+struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {};
+
+// thread_block_tile is supported for every size / parent combination
+template <unsigned int Sz, typename TyPar>
+struct _async_copy_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>>
+    : public _CG_STL_NAMESPACE::true_type {};
+// coalesced_group is supported
+template <>
+struct _async_copy_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
+// thread_block is supported
+template <>
+struct _async_copy_group_supported<cooperative_groups::thread_block> : public _CG_STL_NAMESPACE::true_type {};
+
+// Public-facing form of the trait: passes TyGroup through details::remove_qual before the lookup
+// (presumably stripping cv/reference qualifiers; remove_qual is defined outside this view -- confirm)
+template <class TyGroup>
+using async_copy_group_supported = _async_copy_group_supported<details::remove_qual<TyGroup>>;
+
+// Groups that require optimization
+// Primary template: no tile optimization unless a specialization below says otherwise.
+template <class TyGroup>
+struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {};
+
+// A tile of size 1 is explicitly excluded from the optimized path
+template <typename TyPar>
+struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<1, TyPar>>
+    : public _CG_STL_NAMESPACE::false_type {};
+
+// Every other thread_block_tile size opts in to the optimized path
+template <unsigned int Sz, typename TyPar>
+struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<Sz, TyPar>>
+    : public _CG_STL_NAMESPACE::true_type {};
+
+// Trait applied to details::remove_qual<TyGroup> rather than to TyGroup itself
+template <class TyGroup>
+using async_copy_optimize_tile = _async_copy_optimize_tile<details::remove_qual<TyGroup>>;
+
+// SFINAE helpers for tile optimizations
+// Each alias resolves to void * when its condition holds and is ill-formed otherwise, so it can be
+// used as a defaulted non-type template parameter (= nullptr) to pick between overloads.
+// Well-formed only when async_copy_optimize_tile<TyGroup>::value is true
+template <class TyGroup>
+using enable_tile_optimization =
+    typename _CG_STL_NAMESPACE::enable_if<async_copy_optimize_tile<TyGroup>::value, void *>::type;
+
+// Well-formed only when async_copy_optimize_tile<TyGroup>::value is false
+template <class TyGroup>
+using disable_tile_optimization =
+    typename _CG_STL_NAMESPACE::enable_if<!async_copy_optimize_tile<TyGroup>::value, void *>::type;
+
+// Segment for punning to aligned types
+// Plain aggregate of N ints; it is the payload inherited by the aligned Segment<N> types below.
+template <unsigned int N>
+struct _Segment {
+    int _seg[N];
+};
+
+// Trivial layout guaranteed-aligned copy-async compatible segments
+// Only N = 1, 2 and 4 are defined, declared with __align__ 4, 8 and 16 respectively;
+// the primary template is declared but intentionally left undefined.
+template <unsigned int N>
+struct Segment;
+template <>
+struct __align__(4) Segment<1> : public _Segment<1>{};
+template <>
+struct __align__(8) Segment<2> : public _Segment<2>{};
+template <>
+struct __align__(16) Segment<4> : public _Segment<4>{};
+
+// Synchronous fallback copy of `count` elements from src to dst.
+// The work is interleaved across the group: each thread begins at its own
+// thread_rank() and then advances by group.size() until it runs past `count`.
+template <typename TyGroup, typename TyElem>
+_CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src,
+                                      size_t count) {
+    const unsigned int firstIdx = group.thread_rank();
+    const unsigned int step = group.size();
+
+    size_t pos = firstIdx;
+    while (pos < count) {
+        dst[pos] = src[pos];
+        pos += step;
+    }
+}
+
+// Tile-optimized async copy; this overload participates only when
+// enable_tile_optimization<TyGroup> is well-formed.
+// Copies `count` elements of TyElem from src to dst, issuing one __pipeline_memcpy_async of
+// sizeof(TyElem) bytes per element. Falls back to inline_copy unless __isShared(dst) and
+// __isGlobal(src) both hold.
+template <typename TyGroup, typename TyElem, enable_tile_optimization<TyGroup> = nullptr>
+_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
+                                                 const TyElem *__restrict__ src, size_t count) {
+    static_assert(async_copy_group_supported<TyGroup>::value,
+                  "Async copy is only supported for groups that represent private shared memory");
+
+    // Nothing to do; also keeps the remainder arithmetic below away from an empty range
+    if (count == 0) {
+        return;
+    }
+
+    const bool dstIsNotShared = !__isShared(dst);
+    const bool srcIsNotGlobal = !__isGlobal(src);
+
+    // The async path is only used for a shared destination fed from a global source
+    if (dstIsNotShared || srcIsNotGlobal) {
+        inline_copy(group, dst, src, count);
+        return;
+    }
+
+    const unsigned int stride = group.size();
+    const unsigned int rank = group.thread_rank();
+    // Efficient copies require warps to operate on the same amount of work at each step.
+    // remainders are handled in a separate stage to prevent branching
+    // NOTE(review): the mask arithmetic below assumes group.size() is a power of two -- confirm
+    const unsigned int subWarpMask = (stride - 1);
+    // Number of leftover elements that do not fill a whole stride
+    const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count);
+    // Clamp the rank to the last leftover element so that threads beyond the remainder re-copy
+    // that element instead of branching. When subwarpCopies is 0 the subtraction wraps, but the
+    // value is only consumed inside the `if (subwarpCopies)` block below.
+    const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1);
+
+    // count rounded down to a multiple of stride
+    const size_t warpCopies = (count & (~subWarpMask));
+
+    // Full-stride stage: every thread issues exactly one copy per iteration
+    for (size_t idx = 0; idx < warpCopies; idx += stride) {
+        size_t _srcIdx = rank + idx;
+        size_t _dstIdx = rank + idx;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+
+    // Remainder stage: one extra copy per thread, indices clamped as described above
+    if (subwarpCopies) {
+        size_t _srcIdx = warpCopies + maxSubwarpRank;
+        size_t _dstIdx = warpCopies + maxSubwarpRank;
+        __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
+    }
+}
+
+// Generic async copy, chosen when disable_tile_optimization<TyGroup> is well-formed.
+// Copies `count` elements of TyElem from src to dst. Each thread handles the indices
+// thread_rank(), thread_rank() + size(), ... and issues one __pipeline_memcpy_async of
+// sizeof(TyElem) bytes per element. Falls back to inline_copy unless dst is shared and
+// src is global.
+template <typename TyGroup, typename TyElem, disable_tile_optimization<TyGroup> = nullptr>
+_CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
+                                                 const TyElem *__restrict__ src, size_t count) {
+    static_assert(async_copy_group_supported<TyGroup>::value,
+                  "Async copy is only supported for groups that represent private shared memory");
+
+    const bool dstIsShared = __isShared(dst);
+    const bool srcIsGlobal = __isGlobal(src);
+
+    // Anything other than global -> shared takes the plain element-wise copy
+    if (!(dstIsShared && srcIsGlobal)) {
+        inline_copy(group, dst, src, count);
+        return;
+    }
+
+    const unsigned int step = group.size();
+    const unsigned int firstIdx = group.thread_rank();
+
+    size_t pos = firstIdx;
+    while (pos < count) {
+        __pipeline_memcpy_async(dst + pos, src + pos, sizeof(TyElem));
+        pos += step;
+    }
+}
+
+// Determine best possible alignment given an input and initial conditions
+// Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
+//
+// Compares the address bits of src and dst below MaxAlignment. Returns MaxAlignment when the two
+// addresses agree in every tested bit; otherwise returns the smallest tested power of two
+// (>= MinAlignment) at which they differ.
+// NOTE(review): the loop condition never becomes false for MinAlignment == 0; the caller visible
+// in this file passes its AlignHint -- confirm no caller can pass 0.
+template <unsigned int MinAlignment, unsigned int MaxAlignment>
+_CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) {
+    // Narrowing conversion intentional
+    uint32_t base1 = (uint32_t) reinterpret_cast<uintptr_t>(src);
+    uint32_t base2 = (uint32_t) reinterpret_cast<uintptr_t>(dst);
+
+    // Bits (below MaxAlignment) in which the two addresses disagree
+    uint32_t diff = ((base1) ^ (base2)) & (MaxAlignment - 1);
+
+    // range [MaxAlignment, alignof(elem)], step: x >> 1
+    // over range of possible alignments, choose best available out of range
+    uint32_t out = MaxAlignment;
+#pragma unroll
+    for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) {
+        // Walking downwards, so a later (smaller) differing bit overrides an earlier one
+        if (alignment & diff)
+            out = alignment;
+    }
+
+    return out;
+}
+
+// Copies `count` bytes from _src to _dst in three stages:
+//   1. a byte-wise head (inline_copy) that advances src to an alignof(TyType) boundary,
+//   2. a bulk stage of whole TyType elements through accelerated_async_copy,
+//   3. a byte-wise tail (inline_copy) for whatever is left.
+// NOTE(review): only src is inspected when sizing the head; dst is presumably congruent to src
+// modulo alignof(TyType) because the caller picks TyType from find_best_alignment -- confirm.
+// NOTE(review): `count -= alignOffset` would wrap if count < alignOffset; the caller visible in
+// this file avoids the wide paths when count < alignment -- confirm for any other callers.
+template <typename TyType, typename TyGroup>
+_CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                    size_t count) {
+    const char *src = reinterpret_cast<const char *>(_src);
+    char *dst = reinterpret_cast<char *>(_dst);
+
+    constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType);
+
+    // (-base) mod targetAlignment: number of bytes from src up to the next aligned address
+    uint32_t base = (uint32_t) reinterpret_cast<uintptr_t>(src);
+    uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1);
+
+    // Stage 1: unaligned head, byte by byte
+    inline_copy(group, dst, src, alignOffset);
+    count -= alignOffset;
+    src += alignOffset;
+    dst += alignOffset;
+
+    // Copy using the best available alignment, async_copy expects n-datums, not bytes
+    size_t asyncCount = count / sizeof(TyType);
+    accelerated_async_copy(group, reinterpret_cast<TyType *>(dst), reinterpret_cast<const TyType *>(src), asyncCount);
+    // Back to bytes so the pointers and the remaining count can be advanced
+    asyncCount *= sizeof(TyType);
+
+    // Stage 3: trailing bytes that do not fill a whole TyType
+    count -= asyncCount;
+    src += asyncCount;
+    dst += asyncCount;
+    inline_copy(group, dst, src, count);
+}
+
+// We must determine alignment and manually align src/dst ourselves
+// Generic case (explicit specializations for hints 4, 8 and 16 follow in this file): probes the
+// two pointers at run time with find_best_alignment and picks the copy unit from the result.
+// `count` is a byte count.
+template <size_t AlignHint>
+struct _memcpy_async_align_dispatch {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) {
+        uint32_t alignment = find_best_alignment<AlignHint, 16>(dst, src);
+
+        // Avoid copying the extra bytes if desired copy count is smaller
+        alignment = count < alignment ? AlignHint : alignment;
+
+        switch (alignment) {
+        default:
+        case 1:
+            // Plain byte-by-byte copy
+            inline_copy(group, reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), count);
+            break;
+        case 2:
+            // Copy as 2-byte elements; count is in bytes, hence the halving.
+            // NOTE(review): an odd count drops the final byte here, and the pointers themselves are
+            // not checked for 2-byte alignment -- confirm callers guarantee both.
+            inline_copy(group, reinterpret_cast<short *>(dst), reinterpret_cast<const short *>(src), count >> 1);
+            break;
+        case 4:
+            // Wider units go through copy_like, which handles the unaligned head and tail
+            copy_like<Segment<1>>(group, dst, src, count);
+            break;
+        case 8:
+            copy_like<Segment<2>>(group, dst, src, count);
+            break;
+        case 16:
+            copy_like<Segment<4>>(group, dst, src, count);
+            break;
+        }
+    }
+};
+
+// Specialization for 4 byte alignments
+// The hint is trusted as given: no run-time probing, the buffers are viewed as Segment<1> arrays.
+template <>
+struct _memcpy_async_align_dispatch<4> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) {
+        using Chunk = Segment<1>;
+
+        Chunk *const out = reinterpret_cast<Chunk *>(_dst);
+        const Chunk *const in = reinterpret_cast<const Chunk *>(_src);
+
+        // count arrives in bytes; the copy engine wants a number of Chunk elements
+        accelerated_async_copy(group, out, in, count / sizeof(Chunk));
+    }
+};
+
+// Specialization for 8 byte alignments
+// The hint is trusted as given: no run-time probing, the buffers are viewed as Segment<2> arrays.
+template <>
+struct _memcpy_async_align_dispatch<8> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) {
+        using Chunk = Segment<2>;
+
+        Chunk *const out = reinterpret_cast<Chunk *>(_dst);
+        const Chunk *const in = reinterpret_cast<const Chunk *>(_src);
+
+        // count arrives in bytes; the copy engine wants a number of Chunk elements
+        accelerated_async_copy(group, out, in, count / sizeof(Chunk));
+    }
+};
+
+// Alignments over 16 are truncated to 16 and bypass alignment
+// This is the highest performing memcpy available
+// The buffers are viewed as Segment<4> arrays without any run-time probing.
+template <>
+struct _memcpy_async_align_dispatch<16> {
+    template <typename TyGroup>
+    _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
+                                   size_t count) {
+        using Chunk = Segment<4>;
+
+        Chunk *const out = reinterpret_cast<Chunk *>(_dst);
+        const Chunk *const in = reinterpret_cast<const Chunk *>(_src);
+
+        // count arrives in bytes; the copy engine wants a number of Chunk elements
+        accelerated_async_copy(group, out, in, count / sizeof(Chunk));
+    }
+};
+
+// byte-wide API
+// Forwards a byte-count copy to the _memcpy_async_align_dispatch variant that matches the
+// compile-time Alignment, after checking that Alignment is a power of two.
+template <size_t Alignment, class TyGroup>
+_CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst,
+                                                                 const void *__restrict__ _src, size_t count) {
+    static_assert((Alignment & (Alignment - 1)) == 0, "Known static alignment dispatch must be a power of 2");
+
+    using Dispatcher = details::_memcpy_async_align_dispatch<Alignment>;
+    Dispatcher::copy(group, _dst, _src, count);
+}
+
+// Internal dispatch APIs
+// These deduce the alignments and sizes necessary to invoke the underlying copy engine
+// Trait: is Ty the type void
+template <typename Ty>
+using is_void = _CG_STL_NAMESPACE::is_same<Ty, void>;
+
+// SFINAE helper: resolves to void * only when Ty is not void
+template <typename Ty>
+using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if<!is_void<Ty>::value, void *>::type;
+
+// SFINAE helper: resolves to void * only when Ty is void
+template <typename Ty>
+using enable_if_void = typename _CG_STL_NAMESPACE::enable_if<is_void<Ty>::value, void *>::type;
+
+// SFINAE helper: resolves to void * only when Ty is an integral type
+template <typename Ty>
+using enable_if_integral =
+    typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral<Ty>::value, void *>::type;
+
+// byte-wide API using aligned_size_t
+// `count` is an Alignment<Hint> wrapper (converted to size_t for the copy);
+// Hint supplies the static alignment and is clamped to 16 before dispatch.
+// The template template parameter is spelled with `class` because `typename`
+// in that position is only valid from C++17 on.
+template <class TyGroup, template <size_t> class Alignment, size_t Hint>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst,
+                                              const void *__restrict__ _src, const Alignment<Hint> &count) {
+    constexpr size_t _align = (Hint > 16) ? 16 : Hint;
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count);
+}
+
+// byte-wide API using the element type for alignment
+// Hint defaults to alignof(TyElem); values above 16 are clamped to 16 before dispatch.
+// Selected only for non-void TyElem and integral TySize. `count` is forwarded unchanged.
+template <class TyGroup, typename TyElem, typename TySize, size_t Hint = alignof(TyElem),
+          enable_if_not_void<TyElem> = nullptr, enable_if_integral<TySize> = nullptr>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
+                                              const TyElem *__restrict__ _src, const TySize& count) {
+    constexpr size_t maxAlign = 16;
+    constexpr size_t clampedAlign = (Hint <= maxAlign) ? Hint : maxAlign;
+
+    details::_memcpy_async_dispatch_to_aligned_copy<clampedAlign>(group, _dst, _src, count);
+}
+
+// byte-wide API with full alignment deduction required
+// Selected for void pointers with an integral count: nothing is known
+// statically about the buffers, so the dispatch is made with alignment 1.
+template <class TyGroup, typename TyElem, typename TySize, enable_if_void<TyElem> = nullptr,
+          enable_if_integral<TySize> = nullptr>
+_CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
+                                              const TyElem *__restrict__ _src, const TySize& count) {
+    constexpr size_t unknownAlign = 1;
+
+    details::_memcpy_async_dispatch_to_aligned_copy<unknownAlign>(group, _dst, _src, count);
+}
+
+// 1d-datum API
+// dstCount and srcCount are element counts; the smaller of the two, scaled by
+// sizeof(TyElem), is the number of bytes handed to the copy dispatch.
+// Hint defaults to alignof(TyElem) and is clamped to 16, matching the
+// byte-wide API, so over-aligned element types take the 16-byte
+// specialization directly.
+template <class TyGroup, typename TyElem, size_t Hint = alignof(TyElem)>
+_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount,
+                                              const TyElem *__restrict__ src, const size_t srcCount) {
+    constexpr size_t _align = (Hint > 16) ? 16 : Hint;
+    const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem);
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
+}
+
+// 1d-datum API using aligned_size_t
+// Both counts are Alignment<Hint> wrappers holding element counts; the smaller
+// one, scaled by sizeof(TyElem), is the number of bytes handed to the copy dispatch.
+// Hint is clamped to 16, matching the byte-wide API. The template template
+// parameter is spelled with `class` because `typename` in that position is
+// only valid from C++17 on.
+template <class TyGroup, typename TyElem, template <size_t> class Alignment, size_t Hint>
+_CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment<Hint> &dstCount,
+                                              const TyElem *__restrict__ src, const Alignment<Hint> &srcCount) {
+    constexpr size_t _align = (Hint > 16) ? 16 : Hint;
+    const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem);
+
+    details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
+}
+
+} // namespace details
+
+/*
+ * Byte-count overload.
+ * The group submits a batch of async-copies covering a contiguous 1D array,
+ * then commits that batch so it can be waited on later.
+ * `count` is passed through to the byte-wide internal API.
+ */
+template <class TyGroup, typename TyElem, typename TySizeT>
+_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src,
+                                       const TySizeT &count) {
+    // Queue the copies ...
+    details::_memcpy_async_bytes(group, _dst, _src, count);
+    // ... and close the batch
+    __pipeline_commit();
+}
+
+/*
+ * Element-count overload.
+ * The group submits a batch of async-copies covering a contiguous 1D array,
+ * then commits that batch so it can be waited on later.
+ * Object counts are in datum sized chunks, not bytes.
+ */
+template <class TyGroup, class TyElem, typename DstLayout, typename SrcLayout>
+_CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout,
+                                       const TyElem *__restrict__ src, const SrcLayout &srcLayout) {
+    // Queue the copies ...
+    details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout);
+    // ... and close the batch
+    __pipeline_commit();
+}
+
+/* Group wait for the prior Nth stage of memcpy_async to complete.
+   Calls __pipeline_wait_prior(Stage) and then synchronizes the group. */
+template <unsigned int Stage, class TyGroup>
+_CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) {
+    __pipeline_wait_prior(Stage);
+
+    group.sync();
+}
+
+/* Group wait for all previously submitted memcpy_async to complete.
+   Calls __pipeline_wait_prior(0) and then synchronizes the group. */
+template <class TyGroup>
+_CG_STATIC_QUALIFIER void wait(const TyGroup &group) {
+    __pipeline_wait_prior(0);
+
+    group.sync();
+}
+
+/***************** CG APIs including pipeline are deprecated *****************/
+
+/* Deprecated pipeline overload.
+   Group submit batch of async-copy to cover a contiguous 1D array
+   to a pipeline and commit the batch. Counts are in elements. */
+template <class TyGroup, class TyElem>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount,
+                                                      const TyElem *src, size_t srcCount,
+                                                      nvcuda::experimental::pipeline &pipe) {
+    // Queue the copies ...
+    details::_memcpy_async_datum(group, dst, dstCount, src, srcCount);
+    // ... and close the batch on the caller's pipeline object
+    pipe.commit();
+}
+
+/* Deprecated pipeline overload.
+   Group wait for the prior Nth stage of memcpy_async to complete:
+   waits on the pipeline object, then synchronizes the group. */
+template <unsigned int Stage, class TyGroup>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) {
+    pipe.wait_prior<Stage>();
+
+    group.sync();
+}
+
+/* Deprecated pipeline overload.
+   Group wait for stage-S of memcpy_async to complete:
+   waits on the given pipeline stage, then synchronizes the group. */
+template <class TyGroup>
+_CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) {
+    pipe.wait(stage);
+
+    group.sync();
+}
+_CG_END_NAMESPACE
+
+#endif // _CG_ASYNC_H
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_scan.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..383f4bde059dd8daad7d1c56e99152ea7ee28a08
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_scan.h
@@ -0,0 +1,174 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_COALESCED_SCAN_H_
+#define _CG_COALESCED_SCAN_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "cooperative_groups.h"
+#include "partitioning.h"
+#include "functional.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+// Inclusive scan over a group using shfl_up with a doubling stride.
+// Each round every thread fetches the running value from the thread `stride`
+// ranks below it; threads whose rank is at least `stride` fold that value in
+// as op(own, fetched). Returns this thread's inclusive result.
+template <typename TyGroup, typename TyVal, typename TyOp>
+_CG_QUALIFIER auto inclusive_scan_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    auto running = val;
+
+    int stride = 1;
+    while (stride < group.size()) {
+        // The shuffle is issued by every thread, including those that will ignore the result
+        auto fetched = group.shfl_up(running, stride);
+        if (stride <= group.thread_rank()) {
+            running = op(running, fetched);
+        }
+        stride <<= 1;
+    }
+
+    return running;
+}
+
+// Inclusive scan for a group described by a 32-bit lane mask (obtained from
+// _coalesced_group_data_access::get_mask). The step distance doubles each
+// round as in the contiguous variant, but distances are measured in set bits
+// of the mask, so each thread has to locate the lane of the member that sits
+// `i` ranks below it before shuffling.
+// Returns this thread's inclusive result, combined as op(own, fetched).
+template <typename TyGroup, typename TyVal, typename TyOp>
+_CG_QUALIFIER auto inclusive_scan_non_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    const unsigned int groupSize = group.size();
+    auto out = val;
+
+    const unsigned int mask = details::_coalesced_group_data_access::get_mask(group);
+    // Mask bits selected by lanemask32_lt(); presumably the member lanes below this one
+    unsigned int lanemask = details::lanemask32_lt() & mask;
+    unsigned int srcLane = details::laneid();
+
+    const unsigned int base = __ffs(mask)-1; /* lane with rank == 0 */
+    // Rank = number of set bits counted in lanemask
+    const unsigned int rank = __popc(lanemask);
+
+    for (unsigned int i = 1, j = 1; i < groupSize; i <<= 1) {
+        if (i <= rank) {
+            // Upper bound for the search: step down from the previous source lane
+            srcLane -= j;
+            j = i; /* maximum possible lane */
+
+            unsigned int begLane = base + rank - i; /* minimum possible lane */
+
+            /*  Next source lane is in the range [ begLane .. srcLane ]
+             *  If begLane < srcLane then do a binary search.
+             *  The target is the lane whose bit is set and for which exactly
+             *  i bits of lanemask lie at or above it.
+             */
+            while (begLane < srcLane) {
+                const unsigned int halfLane = (begLane + srcLane) >> 1;
+                const unsigned int halfMask = lanemask >> halfLane;
+                // d = set bits of lanemask at positions >= halfLane
+                const unsigned int d = __popc(halfMask);
+                if (d < i) {
+                    srcLane = halfLane - 1; /* halfLane too large */
+                }
+                else if ((i < d) || !(halfMask & 0x01)) {
+                    begLane = halfLane + 1; /* halfLane too small */
+                }
+                else {
+                    begLane = srcLane = halfLane; /* happen to hit */
+                }
+            }
+        }
+
+        // The shuffle sits outside the rank test, so every thread issues it each round;
+        // only threads with rank >= i use the value.
+        auto tmp = details::tile::shuffle_dispatch<TyVal>::shfl(out, mask, srcLane, 32);
+        if (i <= rank) {
+            out = op(out, tmp);
+        }
+    }
+    return out;
+}
+
+// Inclusive scan entry point for __single_warp_thread_block_tile groups:
+// always forwards to the contiguous scan.
+template <unsigned int TySize, typename ParentT, typename TyVal, typename TyOp>
+_CG_QUALIFIER auto coalesced_inclusive_scan(const __single_warp_thread_block_tile<TySize, ParentT>& group,
+                                            TyVal&& val,
+                                            TyOp&& op) -> decltype(op(val, val)) {
+    return inclusive_scan_contiguous(group,
+                                     _CG_STL_NAMESPACE::forward<TyVal>(val),
+                                     _CG_STL_NAMESPACE::forward<TyOp>(op));
+}
+
+// Inclusive scan entry point for coalesced_group.
+// A group of exactly 32 threads uses the contiguous scan; any other size
+// goes through the mask-aware non-contiguous scan.
+template <typename TyVal, typename TyOp>
+_CG_QUALIFIER auto coalesced_inclusive_scan(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+    if (group.size() != 32) {
+        return inclusive_scan_non_contiguous(group,
+                                             _CG_STL_NAMESPACE::forward<TyVal>(val),
+                                             _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+
+    return inclusive_scan_contiguous(group,
+                                     _CG_STL_NAMESPACE::forward<TyVal>(val),
+                                     _CG_STL_NAMESPACE::forward<TyOp>(op));
+}
+
+// Strategy selector for turning an inclusive scan result into an exclusive one.
+// true  -> arithmetic shortcut, false -> shuffle-based path.
+template <bool IntegralOptimized>
+struct scan_choose_convertion;
+
+// Shortcut: remove the caller's own contribution by subtraction.
+template<>
+struct scan_choose_convertion<true> {
+    template <typename TyGroup, typename TyRes, typename TyVal>
+    _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
+        // No communication needed on this path; `group` is not used
+        return result - val;
+    }
+};
+
+// General path: take the inclusive result of the thread one rank below.
+template<>
+struct scan_choose_convertion<false> {
+    template <typename TyGroup, typename TyRes, typename TyVal>
+    _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
+        // Every thread issues the shuffle before any of them branches on its rank
+        auto fromPrevious = group.shfl_up(result, 1);
+        if (group.thread_rank() != 0) {
+            return fromPrevious;
+        }
+
+        // Rank 0 has no predecessor: return a value-initialized result
+        return {};
+    }
+};
+
+// Converts an inclusive scan result to the exclusive one.
+// The subtraction shortcut is chosen only when the operator is
+// cooperative_groups::plus over an integral value type; every other
+// combination uses the shuffle-based conversion. `op` itself is only
+// inspected for its type.
+template <typename TyGroup, typename TyRes, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    using value_t = remove_qual<TyVal>;
+    using plus_t  = cooperative_groups::plus<value_t>;
+
+    constexpr bool useSubtraction = _CG_STL_NAMESPACE::is_same<remove_qual<TyFn>, plus_t>::value
+                                    && _CG_STL_NAMESPACE::is_integral<value_t>::value;
+
+    using conversion = scan_choose_convertion<useSubtraction>;
+    return conversion::convert_inclusive_to_exclusive(group, result, _CG_STL_NAMESPACE::forward<TyVal>(val));
+}
+
+} // details
+
+_CG_END_NAMESPACE
+
+#endif // _CG_COALESCED_SCAN_H_
\ No newline at end of file
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/driver_abi.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/driver_abi.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c866fcf740beb709a106057d28e8a2a1ac37924
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/driver_abi.h
@@ -0,0 +1,99 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_DRIVER_API_H
+#define _CG_DRIVER_API_H
+
+#include "info.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+    // Primary template: reached only for register ids that have no
+    // explicit specialization. It aborts via _CG_ABORT(); the return
+    // statement only satisfies the signature.
+    template <unsigned int RegId>
+    _CG_QUALIFIER unsigned int load_env_reg() {
+        _CG_ABORT();
+        return 0;
+    }
+
+    // Builds a 64-bit value from two 32-bit environment registers:
+    // HiReg supplies bits 63..32 and LoReg supplies bits 31..0.
+    template <unsigned int HiReg, unsigned int LoReg>
+    _CG_QUALIFIER unsigned long long load_env_reg64() {
+        const unsigned long long lowWord = load_env_reg<LoReg>();
+        const unsigned long long highWord = load_env_reg<HiReg>();
+
+        return lowWord | (highWord << 32);
+    }
+
+// inline PTX for accessing registers requires an immediate for the special reg
+// LOAD_ENVREG(N) defines the explicit specialization load_env_reg<N>(), which
+// reads %envregN with a mov.u32; the register number is pasted into the
+// instruction text with the # stringizing operator.
+# define LOAD_ENVREG(NUMBER) \
+    template <> _CG_QUALIFIER unsigned int load_env_reg<NUMBER>() { \
+        unsigned int r; \
+        asm ("mov.u32 %0, %%envreg" #NUMBER ";" : "=r"(r)); \
+        return r; \
+    }
+
+    // Instantiate loaders for registers used
+    // (any other register id falls back to the aborting primary template)
+    LOAD_ENVREG(0);
+    LOAD_ENVREG(1);
+    LOAD_ENVREG(2);
+# undef LOAD_ENVREG
+
+    // Layout of the memory that get_grid_workspace() points at.
+    // NOTE(review): field meanings are inferred from their names only; confirm against the driver ABI.
+    struct grid_workspace {
+        unsigned int wsSize;   // presumably the workspace size
+        unsigned int barrier;  // presumably the grid-wide barrier word
+    };
+
+    // Returns the driver's grid workspace. Its address is read from the
+    // environment registers: envreg1 holds the upper 32 bits and envreg2
+    // the lower 32 bits.
+    _CG_QUALIFIER grid_workspace* get_grid_workspace() {
+        const unsigned long long abiAddress = load_env_reg64<1, 2>();
+
+        grid_workspace *workspace = reinterpret_cast<grid_workspace*>(abiAddress);
+        return workspace;
+    }
+}
+_CG_END_NAMESPACE
+
+#endif // _CG_DRIVER_API_H
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a860402ea9e8be784d384d756217fd4c656538a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h
@@ -0,0 +1,344 @@
+ /* Copyright 1993-2021 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+
+
+#ifndef _CG_INFO_H_
+#define _CG_INFO_H_
+/*
+** Define: _CG_VERSION
+*/
+#define _CG_VERSION 1000
+
+/*
+** Define: _CG_ABI_VERSION
+** Desc: Selects the versioned inner namespace (__v<N>); may be pre-defined by the includer
+*/
+#ifndef _CG_ABI_VERSION
+# define _CG_ABI_VERSION 1
+#endif
+
+/*
+** Define: _CG_ABI_EXPERIMENTAL
+** Desc: If enabled, sets all features enabled (ABI-breaking or experimental)
+*/
+#if defined(_CG_ABI_EXPERIMENTAL)
+#endif
+
+/* Two-level concatenation so that _CG_ABI_VERSION is expanded before pasting */
+#define _CG_CONCAT_INNER(x, y) x ## y
+#define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y)
+#define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION)
+
+/*
+** Open / close cooperative_groups::__v<N>. Closing also pulls the versioned
+** namespace into cooperative_groups with a using-directive.
+** No semicolons follow the closing braces: a ';' after a namespace body is an
+** extra empty declaration that pedantic builds warn about.
+*/
+#define _CG_BEGIN_NAMESPACE \
+    namespace cooperative_groups { namespace _CG_NAMESPACE {
+#define _CG_END_NAMESPACE \
+    } using namespace _CG_NAMESPACE; }
+
+// C++11 language level (or _MSC_VER >= 1900) turns on the constexpr / alias based parts of CG
+#if (defined(__cplusplus) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MSC_VER >= 1900))
+# define _CG_CPP11_FEATURES
+#endif
+
+// Function qualifier macros. Each one is only defined here when the includer
+// has not already provided its own definition.
+#if !defined(_CG_QUALIFIER)
+# define _CG_QUALIFIER __forceinline__ __device__
+#endif
+#if !defined(_CG_STATIC_QUALIFIER)
+# define _CG_STATIC_QUALIFIER static __forceinline__ __device__
+#endif
+// constexpr variants fall back to the plain qualifiers without C++11
+#if !defined(_CG_CONSTEXPR_QUALIFIER)
+# if defined(_CG_CPP11_FEATURES)
+#  define _CG_CONSTEXPR_QUALIFIER constexpr __forceinline__ __device__
+# else
+#  define _CG_CONSTEXPR_QUALIFIER _CG_QUALIFIER
+# endif
+#endif
+#if !defined(_CG_STATIC_CONSTEXPR_QUALIFIER)
+# if defined(_CG_CPP11_FEATURES)
+#  define _CG_STATIC_CONSTEXPR_QUALIFIER static constexpr __forceinline__ __device__
+# else
+#  define _CG_STATIC_CONSTEXPR_QUALIFIER _CG_STATIC_QUALIFIER
+# endif
+#endif
+
+// Compiler-specific spelling of the deprecation attribute
+#if defined(_MSC_VER)
+# define _CG_DEPRECATED __declspec(deprecated)
+#else
+# define _CG_DEPRECATED __attribute__((deprecated))
+#endif
+
+// Feature availability switches. Each is enabled when __CUDA_ARCH__ meets the
+// listed minimum, or when __CUDA_ARCH__ is not defined at all.
+#if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
+# define _CG_HAS_GRID_GROUP
+#endif
+#if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
+# define _CG_HAS_MULTI_GRID_GROUP
+#endif
+#if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__)
+# define _CG_HAS_MATCH_COLLECTIVE
+#endif
+
+// NOTE(review): && binds tighter than ||, so this condition reads as
+//   (ARCH >= 800) || (!defined(ARCH) && (NVCC || NVRTC)).
+// The cluster-group check below parenthesizes the arch test before applying
+// the compiler test; confirm which grouping is intended here.
+#if (__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__) && (defined(__NVCC__) || defined(__CUDACC_RTC__))
+# define _CG_HAS_OP_REDUX
+#endif
+
+// Can be switched off by defining _CG_USER_PROVIDED_SHARED_MEMORY
+#if ((__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__)) && !defined(_CG_USER_PROVIDED_SHARED_MEMORY)
+# define _CG_HAS_RESERVED_SHARED
+#endif
+
+// Requires C++11 plus either NVCC/NVRTC or an explicit _CG_CLUSTER_INTRINSICS_AVAILABLE
+#if ((__CUDA_ARCH__ >= 900) || !defined(__CUDA_ARCH__)) && \
+    (defined(__NVCC__) || defined(__CUDACC_RTC__) || defined(_CG_CLUSTER_INTRINSICS_AVAILABLE)) && \
+    defined(_CG_CPP11_FEATURES)
+# define _CG_HAS_CLUSTER_GROUP
+#endif
+
+#if (__CUDA_ARCH__ >= 900) || !defined(__CUDA_ARCH__)
+# define _CG_HAS_INSTR_ELECT
+#endif
+
+// Has __half and __half2
+// Only usable if you include the cuda_fp16.h extension, and
+// _before_ including cooperative_groups.h
+#ifdef __CUDA_FP16_TYPES_EXIST__
+# define _CG_HAS_FP16_COLLECTIVE
+#endif
+
+// Include libcu++ where supported.
+// Exactly one of _CG_USE_CUDA_STL / _CG_USE_OWN_TRAITS ends up defined.
+#if defined(_CG_CPP11_FEATURES) && !defined(__QNX__) && !defined(__ibmxl__) && \
+    (defined(__NVCC__) || defined(__CUDACC_RTC__)) && \
+    (defined(__x86_64__) || defined(__aarch64__) || defined(__ppc64__)|| defined(_M_X64) || defined(_M_ARM64)) && \
+    (defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__))
+# define _CG_USE_CUDA_STL
+#else
+# define _CG_USE_OWN_TRAITS
+#endif
+
+// With libcu++ in use, the minimum __CUDA_ARCH__ is 600 for non-MSVC and 700 for MSVC
+#if defined(_CG_USE_CUDA_STL) && (!defined(__CUDA_ARCH__) || \
+    ((!defined(_MSC_VER) && __CUDA_ARCH__ >= 600) || (defined(_MSC_VER) && __CUDA_ARCH__ >= 700)))
+# define _CG_HAS_STL_ATOMICS
+#endif
+
+// _CG_STL_NAMESPACE names the namespace that type traits are taken from
+#ifdef _CG_CPP11_FEATURES
+// Use cuda::std:: for type_traits
+# if defined(_CG_USE_CUDA_STL)
+#  define _CG_STL_NAMESPACE cuda::std
+#  include <cuda/std/type_traits>
+// Use CG's implementation of type traits
+# else
+#  define _CG_STL_NAMESPACE cooperative_groups::details::templates
+# endif
+#endif
+
+// Constant declaration specifiers: constexpr with C++11, plain const otherwise
+#ifdef _CG_CPP11_FEATURES
+# define _CG_STATIC_CONST_DECL static constexpr
+# define _CG_CONST_DECL constexpr
+#else
+# define _CG_STATIC_CONST_DECL static const
+# define _CG_CONST_DECL const
+#endif
+
+// Inline-asm constraint letter for pointer operands:
+// "r" on 32-bit MSVC and __arm__, "l" everywhere else
+#if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
+# define _CG_ASM_PTR_CONSTRAINT "r"
+#else
+#  define _CG_ASM_PTR_CONSTRAINT "l"
+#endif
+
+/*
+** Define: CG_DEBUG
+** What: Enables various runtime safety checks
+** Note: only takes effect together with __CUDACC_DEBUG__ and without NDEBUG
+*/
+#if defined(__CUDACC_DEBUG__) && defined(CG_DEBUG) && !defined(NDEBUG)
+# define _CG_DEBUG
+#endif
+
+// Both macros carry their own trailing ';'. Without _CG_DEBUG, _CG_ASSERT
+// expands to nothing and _CG_ABORT traps unconditionally.
+#if defined(_CG_DEBUG)
+# include <assert.h>
+# define _CG_ASSERT(x) assert((x));
+# define _CG_ABORT() assert(0);
+#else
+# define _CG_ASSERT(x)
+# define _CG_ABORT() __trap();
+#endif
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+    // NOTE(review): presumably the thread-block size assumed when the caller gives none; its uses are outside this view.
+    _CG_STATIC_CONST_DECL unsigned int default_max_block_size = 1024;
+
+#if defined(_CG_CPP11_FEATURES) && !defined(_CG_USE_CUDA_STL)
+namespace templates {
+
+/**
+ * Integral constants
+ * Wraps a compile-time value of type Ty; convertible to, and callable as, that value.
+ **/
+template <typename Ty, Ty Val>
+struct integral_constant {
+    using type = Ty;
+    static constexpr Ty value = Val;
+
+    _CG_QUALIFIER constexpr operator type() const noexcept { return Val; }
+    _CG_QUALIFIER constexpr type operator()() const noexcept { return Val; }
+};
+
+using true_type  = integral_constant<bool, true>;
+using false_type = integral_constant<bool, false>;
+
+/**
+ * Reference and CV qualifier traits
+ **/
+// True only for Ty& (lvalue references)
+template <class Ty> struct is_lvalue_reference      : public details::templates::false_type {};
+template <class Ty> struct is_lvalue_reference<Ty&> : public details::templates::true_type {};
+
+// Strips one level of & or && from Ty
+template <class Ty> struct remove_reference       { using type = Ty; };
+template <class Ty> struct remove_reference<Ty&>  { using type = Ty; };
+template <class Ty> struct remove_reference<Ty&&> { using type = Ty; };
+
+template <class Ty>
+using remove_reference_t = typename details::templates::remove_reference<Ty>::type;
+
+// Strips a top-level const
+template <class Ty> struct remove_const           { using type = Ty; };
+template <class Ty> struct remove_const<const Ty> { using type = Ty; };
+
+// Strips a top-level volatile
+template <class Ty> struct remove_volatile              { using type = Ty; };
+template <class Ty> struct remove_volatile<volatile Ty> { using type = Ty; };
+
+// Strips const first, then volatile
+template <class Ty>
+struct remove_cv {
+    using type = typename details::templates::remove_volatile<
+        typename details::templates::remove_const<Ty>::type>::type;
+};
+
+template <class Ty>
+using remove_cv_t = typename details::templates::remove_cv<Ty>::type;
+
+// Perfect forwarding, lvalue overload: casts the argument to Ty&&, which
+// collapses to an lvalue reference when Ty is one.
+template <class Ty>
+_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &arg) noexcept {
+    return static_cast<Ty&&>(arg);
+}
+
+// Perfect forwarding, rvalue overload: rejected at compile time when Ty is an
+// lvalue reference.
+template <class Ty>
+_CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &&arg) noexcept {
+    static_assert(!details::templates::is_lvalue_reference<Ty>::value, "Forwarding an rvalue as an lvalue is not allowed.");
+    return static_cast<Ty&&>(arg);
+}
+
+/**
+ * is_integral
+ * Fallback for std::is_integral: true for the listed types, with cv-qualifiers ignored.
+ **/
+template <class Ty> struct _is_integral                     : public details::templates::false_type {};
+template <>         struct _is_integral<bool>               : public details::templates::true_type {};
+template <>         struct _is_integral<char>               : public details::templates::true_type {};
+// signed char is a distinct type from char and unsigned char, so it needs its own entry
+template <>         struct _is_integral<signed char>        : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned char>      : public details::templates::true_type {};
+// char16_t / char32_t are distinct fundamental types in C++11, which this section requires
+template <>         struct _is_integral<char16_t>           : public details::templates::true_type {};
+template <>         struct _is_integral<char32_t>           : public details::templates::true_type {};
+template <>         struct _is_integral<short>              : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned short>     : public details::templates::true_type {};
+template <>         struct _is_integral<int>                : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned int>       : public details::templates::true_type {};
+template <>         struct _is_integral<long>               : public details::templates::true_type {};
+template <>         struct _is_integral<long long>          : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned long>      : public details::templates::true_type {};
+template <>         struct _is_integral<unsigned long long> : public details::templates::true_type {};
+//Vector type support?
+
+// Public trait: strips cv-qualifiers before consulting the table above
+template <typename Ty>
+struct is_integral : public details::templates::_is_integral<typename details::templates::remove_cv<Ty>::type> {};
+
+/**
+ * is_floating_point
+ * True for float, double and long double, plus __half and __half2 when
+ * __CUDA_FP16_TYPES_EXIST__ is defined; cv-qualifiers are ignored.
+ **/
+template <class Ty> struct _is_floating_point              : public details::templates::false_type {};
+template <>         struct _is_floating_point<float>       : public details::templates::true_type {};
+template <>         struct _is_floating_point<double>      : public details::templates::true_type {};
+template <>         struct _is_floating_point<long double> : public details::templates::true_type {};
+# ifdef __CUDA_FP16_TYPES_EXIST__
+template <>         struct _is_floating_point<__half>      : public details::templates::true_type {};
+template <>         struct _is_floating_point<__half2>     : public details::templates::true_type {};
+# endif
+//Vector type support?
+
+// Public trait: strips cv-qualifiers before consulting the table above
+template <typename Ty>
+struct is_floating_point : public details::templates::_is_floating_point<typename details::templates::remove_cv<Ty>::type> {};
+
+// Arithmetic = integral or floating point
+template <class T>
+struct is_arithmetic
+    : details::templates::integral_constant<bool,
+                                            details::templates::is_integral<T>::value ||
+                                                details::templates::is_floating_point<T>::value> {};
+
+// Arithmetic types: unsigned exactly when converting -1 gives a value above zero
+template <typename Ty, bool = details::templates::is_arithmetic<Ty>::value>
+struct _is_unsigned : details::templates::integral_constant<bool, (Ty(0) < Ty(-1))> {};
+
+// Non-arithmetic types are never unsigned
+template <typename Ty>
+struct _is_unsigned<Ty, false> : details::templates::false_type {};
+
+// Public trait: strips cv-qualifiers first
+template <typename Ty>
+struct is_unsigned : _is_unsigned<typename details::templates::remove_cv<Ty>::type> {};
+
+// Matches Ty* only; the public trait below strips cv-qualifiers from the pointer first
+template <typename Ty>
+struct _is_pointer : public details::templates::false_type {};
+
+template <typename Ty>
+struct _is_pointer<Ty*> : public details::templates::true_type {};
+
+template <typename Ty>
+struct is_pointer : _is_pointer<typename details::templates::remove_cv<Ty>::type> {};
+
+/**
+ * programmatic type traits
+ **/
+// enable_if: exposes a member `type` only when the condition is true
+template <bool B, class Ty = void>
+struct enable_if {};
+
+template <class Ty>
+struct enable_if<true, Ty> {
+    using type = Ty;
+};
+
+template <bool Cond, typename Ty = void>
+using enable_if_t = typename details::templates::enable_if<Cond, Ty>::type;
+
+// is_same: true only when both arguments name the identical type
+template <class Ty1, class Ty2>
+struct is_same : details::templates::false_type {};
+
+template <class Ty>
+struct is_same<Ty, Ty> : details::templates::true_type {};
+
+} // templates
+#endif // _CG_CPP11_FEATURES
+
+} // details
+_CG_END_NAMESPACE
+
+
+#endif // _CG_INFO_H_
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/invoke.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/invoke.h
new file mode 100644
index 0000000000000000000000000000000000000000..f00314ce140e390be90a1ab3c328fd73d73c0d46
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/invoke.h
@@ -0,0 +1,189 @@
+/*
+ * Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CG_INVOKE_H
+#define _CG_INVOKE_H
+
+#include "info.h"
+#include "helpers.h"
+
+#if defined(_CG_CPP11_FEATURES)
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    template <typename Group>  // Per-group-type flag selecting the elect.sync code path; false unless specialized below.
+    struct _elect_group_supported : _CG_STL_NAMESPACE::false_type {};
+#ifdef _CG_HAS_INSTR_ELECT
+    template<>  // With _CG_HAS_INSTR_ELECT defined, coalesced_group is marked as supported.
+    struct _elect_group_supported<coalesced_group> : _CG_STL_NAMESPACE::true_type {};
+    template<unsigned int Size, typename Parent>  // thread_block_tile is marked as supported only when Size <= 32.
+    struct _elect_group_supported<thread_block_tile<Size, Parent>> :
+        _CG_STL_NAMESPACE::integral_constant<bool, (Size <= 32)> {};
+#endif
+
+    template <typename Group>  // Applies details::remove_qual to Group before looking up _elect_group_supported.
+    struct elect_group_supported : public _elect_group_supported<details::remove_qual<Group>> {};
+
+    template<typename Group>  // Runs PTX elect.sync over `mask`; returns true on the thread whose predicate output was set. `group` is not read in the body.
+    _CG_STATIC_QUALIFIER bool elect_one(const Group& group, unsigned int mask, unsigned int& leader_lane) {
+        int is_leader = 0;  // Stays 0 (function returns false) when _CG_HAS_INSTR_ELECT is not defined.
+#ifdef _CG_HAS_INSTR_ELECT
+        asm("{\n\t"
+          " .reg .pred p;\n\t"
+          "  elect.sync %0|p, %2;\n\t"
+          " @p mov.s32 %1, 1;\n\t"
+          "}"
+          : "+r"(leader_lane), "+r"(is_leader) : "r" (mask));  // %0 = leader_lane (written by elect.sync), %1 = is_leader (set to 1 under predicate p), %2 = mask.
+#endif
+        return is_leader;  // Implicit int -> bool conversion.
+    }
+
+    template<bool UseElect>  // Implementation selector for invoke_one / invoke_one_broadcast; both bool values are fully specialized below.
+    struct invoke_one_impl {};
+
+    template<>  // Implementation that picks the calling thread through elect_one().
+    struct invoke_one_impl<true> {
+        template<typename Group, typename Fn, typename... Args>  // Calls fn(args...) only on the thread for which elect_one() returns true.
+        _CG_STATIC_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
+            auto mask = details::_coalesced_group_data_access::get_mask(group);
+            unsigned int leader_lane = 0;  // Required by elect_one(); not read again in this function.
+
+            if (elect_one(group, mask, leader_lane)) {
+                _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
+            }
+        }
+
+        template<typename Group, typename Fn, typename... Args>  // Calls fn(args...) on the elected thread and returns a value shuffled from leader_lane.
+        _CG_STATIC_QUALIFIER auto invoke_one_broadcast(const Group& group, Fn&& fn, Args&&... args)
+                -> typename _CG_STL_NAMESPACE::remove_reference<
+                    decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {  // Return type: fn's result type with any reference removed.
+
+            using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
+            details::remove_qual<ResultType> result;  // Assigned only on the elected thread; other threads pass their unassigned copy into the shuffle below.
+            auto mask = details::_coalesced_group_data_access::get_mask(group);
+            unsigned int leader_lane = 0;
+
+            if (elect_one(group, mask, leader_lane)) {
+                result = _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
+            }
+
+            // Need to use low level api instead of group.shfl, because elect_one returns lane id, not group rank.
+            return tile::shuffle_dispatch<ResultType>::shfl(result, mask, leader_lane, 32);  // NOTE(review): the trailing 32 is presumably the shuffle width (full warp) - confirm against shuffle_dispatch.
+        }
+    };
+
+    template<>  // Fallback implementation: the thread with thread_rank() == 0 is the one that calls fn.
+    struct invoke_one_impl<false> {
+        template<typename Group, typename Fn, typename... Args>  // Calls fn(args...) only on the thread whose group.thread_rank() is 0.
+        _CG_STATIC_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
+            if (group.thread_rank() == 0) {
+                _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
+            }
+        }
+
+        template<typename Group, typename Fn, typename... Args>  // Calls fn(args...) on rank 0 and returns group.shfl(result, 0) on every calling thread.
+        _CG_STATIC_QUALIFIER auto invoke_one_broadcast(const Group& group, Fn&& fn, Args&&... args)
+                -> typename _CG_STL_NAMESPACE::remove_reference<
+                    decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {  // Return type: fn's result type with any reference removed.
+
+            using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
+            details::remove_qual<ResultType> result;  // Assigned only on rank 0; other threads pass their unassigned copy into shfl.
+
+            if (group.thread_rank() == 0) {
+                result = _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
+            }
+
+            return group.shfl(result, 0);  // Source index 0 matches the rank that ran fn above.
+        }
+    };
+
+
+}; // namespace details
+
+template<typename Group, typename Fn, typename... Args>  // Public entry point: forwards fn and args to the invoke_one_impl chosen by elect_group_supported<Group>.
+_CG_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
+    using impl = details::invoke_one_impl<details::elect_group_supported<Group>::value>;  // true -> elect_one-based path, false -> thread_rank() == 0 path.
+    impl::invoke_one(group, _CG_STL_NAMESPACE::forward<Fn>(fn), _CG_STL_NAMESPACE::forward<Args>(args)...);
+}
+
+template<typename Fn, typename... Args>  // coalesced_group overload: runs fn via the selected invoke_one_impl and returns its broadcast result.
+_CG_QUALIFIER auto invoke_one_broadcast(const coalesced_group& group, Fn&& fn, Args&&... args)
+        -> typename _CG_STL_NAMESPACE::remove_reference<
+            decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {  // Return type: fn's result type with any reference removed.
+
+    using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
+    static_assert(!_CG_STL_NAMESPACE::is_same<ResultType, void>::value,
+                  "For invocables returning void invoke_one should be used instead");  // Compile-time rejection of void-returning invocables.
+    using impl = details::invoke_one_impl<details::elect_group_supported<coalesced_group>::value>;
+    return impl::invoke_one_broadcast(group,
+                                      _CG_STL_NAMESPACE::forward<Fn>(fn),
+                                      _CG_STL_NAMESPACE::forward<Args>(args)...);
+}
+
+template<unsigned int Size, typename Parent, typename Fn, typename... Args>  // thread_block_tile overload: same contract as the coalesced_group overload.
+_CG_QUALIFIER auto invoke_one_broadcast(const thread_block_tile<Size, Parent>& group, Fn&& fn, Args&&... args)
+        -> typename _CG_STL_NAMESPACE::remove_reference<
+            decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {  // Return type: fn's result type with any reference removed.
+
+    using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
+    static_assert(!_CG_STL_NAMESPACE::is_same<ResultType, void>::value,
+                  "For invocables returning void invoke_one should be used instead");  // Compile-time rejection of void-returning invocables.
+    using impl = details::invoke_one_impl<details::elect_group_supported<thread_block_tile<Size, Parent>>::value>;  // Implementation depends on the trait value for this tile type.
+    return impl::invoke_one_broadcast(group,
+                                      _CG_STL_NAMESPACE::forward<Fn>(fn),
+                                      _CG_STL_NAMESPACE::forward<Args>(args)...);
+}
+
+_CG_END_NAMESPACE
+
+#endif //_CG_CPP11_FEATURES
+
+#endif // _CG_INVOKE_H
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/memory.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/memory.h
new file mode 100644
index 0000000000000000000000000000000000000000..47cf260f3b4e0b29bf08c948697102bf027616db
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/memory.h
@@ -0,0 +1,135 @@
+/* Copyright 1993-2022 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _COOPERATIVE_GROUPS_MEMORY_H_
+# define _COOPERATIVE_GROUPS_MEMORY_H_
+
+#include "info.h"
+
+_CG_BEGIN_NAMESPACE
+
+#if defined(_CG_CPP11_FEATURES)
+namespace details {
+    _CG_STATIC_CONST_DECL int scratch_num_reserved_bytes = 12;  // Length of multi_warp_scratch::reserved; also counted in scratch_size_needed().
+
+#if defined(_CG_HAS_RESERVED_SHARED)
+    _CG_STATIC_QUALIFIER void* reserved_shared_ptr()  // Returns a pointer derived from the %reserved_smem_offset_1 special register.
+    {
+        void *ptr;  // Written by the asm output operand below.
+        asm ("{\n\t"
+             " .reg .u32 start;\n\t"
+             " .reg .u64 extended;\n\t"
+             " mov.u32 start, %%reserved_smem_offset_1;\n\t"
+             " cvt.u64.u32 extended, start;\n\t"
+             " cvta.shared.u64 %0, extended;\n\t"
+             "}"
+             : "=" _CG_ASM_PTR_CONSTRAINT(ptr));  // Sequence: read the 32-bit offset, widen it to 64 bits, then cvta.shared.u64 it into ptr.
+        return ptr;
+    }
+#endif
+
+    struct multi_warp_scratch {  // Scratch layout: barriers, reserved bytes, then one communication slot per 32 threads.
+        // One barrier per possible size of the group.
+        _CG_STATIC_CONST_DECL unsigned int memory_barriers_count = 5;
+        _CG_STATIC_CONST_DECL size_t sync_memory_size = memory_barriers_count * sizeof(barrier_t);  // Bytes occupied by the barriers array.
+
+        using communication_type = unsigned long long;
+        _CG_STATIC_CONST_DECL size_t communication_size = sizeof(communication_type);  // Also used as the alignment of block_tile_memory.
+
+        // Layout of the scratch space:
+        barrier_t barriers[memory_barriers_count];
+        char reserved[scratch_num_reserved_bytes]; // Reserve 12 bytes for future use
+        communication_type communication_memory[default_max_block_size / 32];  // Sized for default_max_block_size, independent of any MaxBlockSize template argument.
+
+        _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int max_block_size) {  // Bytes needed for a block of max_block_size threads.
+            // One slot of collectives memory per warp.
+            return scratch_num_reserved_bytes + sync_memory_size + max_block_size / 32 * communication_size;  // Integer division: a remainder below 32 threads adds no slot.
+        }
+
+        _CG_QUALIFIER void init_barriers(unsigned int thread_rank) {  // Zeroes barriers[thread_rank]; ranks >= memory_barriers_count do nothing.
+            if (thread_rank < memory_barriers_count) {
+                barriers[thread_rank] = 0;
+            }
+        }
+    };
+
+#if defined(_CG_HAS_RESERVED_SHARED)
+    // CG can expect at least 288 bytes available in reserved shared
+    static_assert(sizeof(multi_warp_scratch) <= 288, "multi-warp scratch size is too large");  // Checked only when the reserved shared region is in use.
+#endif
+
+    // Make sure the structure can fit into the user provided memory
+    static_assert(sizeof(multi_warp_scratch) <= multi_warp_scratch::scratch_size_needed(default_max_block_size),
+                  "multi-warp scratch size is too large");  // Compares the actual struct size (padding included) with the computed byte count.
+
+
+    _CG_QUALIFIER multi_warp_scratch* get_scratch_ptr(void* user_scratch) {  // Returns the scratch area as a multi_warp_scratch*.
+        void *ptr;
+#if defined(_CG_HAS_RESERVED_SHARED)
+        ptr = reserved_shared_ptr();  // user_scratch is ignored in this configuration.
+#else
+        ptr = user_scratch;  // No null or alignment check is performed here.
+#endif
+        return static_cast<multi_warp_scratch*>(ptr);
+
+    }
+
+}
+
+template <unsigned int MaxBlockSize = details::default_max_block_size>  // User-visible scratch storage type, parameterized by maximum block size.
+struct __align__(details::multi_warp_scratch::communication_size) block_tile_memory {  // Aligned to sizeof(communication_type).
+private:
+#if !defined(_CG_HAS_RESERVED_SHARED)
+    char scratch[details::multi_warp_scratch::scratch_size_needed(MaxBlockSize)];  // Present only without reserved shared memory; otherwise the struct has no data members.
+#endif
+};
+#endif
+
+_CG_END_NAMESPACE
+
+#endif /* !_COOPERATIVE_GROUPS_MEMORY_H_ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h
new file mode 100644
index 0000000000000000000000000000000000000000..9c219756c594c87be85a0a154cfa5579241a861f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h
@@ -0,0 +1,159 @@
+/*
+ * Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CG_PARTITIONING_H
+#define _CG_PARTITIONING_H
+
+#include "info.h"
+#include "helpers.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    template <typename TyGroup>  // Splits `tile` by `pred`: returns the coalesced_group of threads that share the caller's predicate value.
+    _CG_STATIC_QUALIFIER coalesced_group _binary_partition(const TyGroup &tile, bool pred) {
+        const unsigned int fullMask = ~0u;
+
+        unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
+        unsigned int predMask = pred ? 0 : fullMask;  // XOR-ing with this leaves setMask as is for pred == true and inverts it for pred == false.
+        unsigned int setMask = __ballot_sync(thisMask, pred);  // Ballot of pred over the threads in thisMask.
+
+        if (setMask == thisMask || setMask == 0) {  // Every thread voted the same way: no split happens.
+            coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(thisMask);
+            _coalesced_group_data_access::modify_meta_group(subTile, 0, 1);  // NOTE(review): arguments look like (meta rank, meta size) = (0, 1) - confirm against modify_meta_group.
+            return subTile;
+        }
+        else {
+            unsigned int subMask = thisMask & (setMask ^ predMask);  // Threads of this tile whose vote equals the caller's pred.
+            coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
+            _coalesced_group_data_access::modify_meta_group(subTile, pred, 2);  // pred converts to 0 or 1; the second value is fixed at 2.
+            return subTile;
+        }
+    }
+
+#if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
+    template <typename TyPredicate>  // Generic labeled partition: groups the threads of `tile` by equal `pred` via __match_any_sync.
+    struct _labeled_partition_dispatch {
+        template <typename TyGroup>
+        _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, TyPredicate pred) {
+            unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
+            unsigned int thisBias = __ffs(thisMask) - 1; // Subtract 1 to index properly from [1-32]
+            unsigned int subMask = __match_any_sync(thisMask, pred);  // Mask used to build this thread's sub-group.
+
+            coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
+
+            int leaderLaneId = subTile.shfl(details::laneid(), 0);  // Lane id read from index 0 of the sub-group.
+
+            bool isLeader = !subTile.thread_rank();  // True on the thread with sub-group rank 0.
+            unsigned int leaderMask = __ballot_sync(thisMask, isLeader);  // One bit per sub-group leader.
+            unsigned int tileRank = __fns(leaderMask, leaderLaneId, 0) - thisBias;  // NOTE(review): semantics depend on __fns with offset 0 - confirm against the CUDA intrinsic docs.
+
+            _coalesced_group_data_access::modify_meta_group(subTile, tileRank, __popc(leaderMask));  // Second value is the number of leaders found.
+
+            return subTile;
+        }
+    };
+
+    template <>  // bool labels have only two values, so this delegates to _binary_partition.
+    struct _labeled_partition_dispatch<bool> {
+        template <typename TyGroup>
+        _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, bool pred) {
+            return _binary_partition(tile, pred);
+        }
+    };
+
+    template <typename TyPredicate>  // Pointer labels are partitioned by address value.
+    struct _labeled_partition_dispatch<TyPredicate*> {
+        template <typename TyGroup>
+        _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, TyPredicate* pred) {
+            auto impl = _labeled_partition_dispatch<unsigned long long>();
+            return impl(tile, reinterpret_cast<unsigned long long>(pred));  // Reuses the generic path with the pointer reinterpreted as unsigned long long.
+        }
+    };
+#endif
+}; // namespace details
+
+_CG_STATIC_QUALIFIER coalesced_group binary_partition(const coalesced_group &tile, bool pred) {  // Public coalesced_group overload; forwards to details::_binary_partition.
+    return details::_binary_partition(tile, pred);
+}
+
+template <unsigned int Size, typename ParentT>  // Public thread_block_tile overload; forwards to details::_binary_partition.
+_CG_STATIC_QUALIFIER coalesced_group binary_partition(const thread_block_tile<Size, ParentT> &tile, bool pred) {
+#ifdef _CG_CPP11_FEATURES
+    static_assert(Size <= 32, "Binary partition is available only for tiles of size smaller or equal to 32");  // The size limit is enforced only when _CG_CPP11_FEATURES is defined.
+#endif
+    return details::_binary_partition(tile, pred);
+}
+
+
+#if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
+template <typename TyPredicate>  // Public coalesced_group overload: partitions `tile` by the label `pred`.
+_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const coalesced_group &tile, TyPredicate pred) {
+    static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value ||
+                  _CG_STL_NAMESPACE::is_pointer<TyPredicate>::value,
+                  "labeled_partition predicate must be an integral or pointer type");
+    auto dispatch = details::_labeled_partition_dispatch<details::remove_qual<TyPredicate>>();  // remove_qual lets qualified label types reach the bool and pointer specializations.
+    return dispatch(tile, pred);
+}
+
+template <typename TyPredicate, unsigned int Size, typename ParentT>  // Public thread_block_tile overload: partitions `tile` by the label `pred`.
+_CG_STATIC_QUALIFIER coalesced_group labeled_partition(const thread_block_tile<Size, ParentT> &tile, TyPredicate pred) {
+    static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value ||
+                  _CG_STL_NAMESPACE::is_pointer<TyPredicate>::value,
+                  "labeled_partition predicate must be an integral or pointer type");
+    static_assert(Size <= 32, "Labeled partition is available only for tiles of size smaller or equal to 32");  // Tiles larger than 32 threads are rejected at compile time.
+    auto dispatch = details::_labeled_partition_dispatch<details::remove_qual<TyPredicate>>();  // remove_qual lets qualified label types reach the bool and pointer specializations.
+    return dispatch(tile, pred);
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_PARTITIONING_H
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h
new file mode 100644
index 0000000000000000000000000000000000000000..0313b52a23f440e283509993d6f7997ba5df2365
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h
@@ -0,0 +1,419 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_REDUCE_H_
+#define _CG_REDUCE_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "coalesced_reduce.h"
+#include "functional.h"
+#include "cooperative_groups.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    template <class Ty>  // True for integral Ty whose size is at most 4 bytes.
+    using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
+            bool,
+            _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
+
+    template <class Ty>  // Alias of _redux_is_add_supported.
+    using redux_is_add_supported = _redux_is_add_supported<Ty>;
+
+    // A specialization for 64 bit logical operations is possible
+    // but for now only accelerate 32 bit bitwise ops
+    template <class Ty>  // Currently the same condition as redux_is_add_supported.
+    using redux_is_logical_supported = redux_is_add_supported<Ty>;
+
+    // Base operator support case
+    template <class TyOp, class Ty> struct _redux_op_supported                 : public _CG_STL_NAMESPACE::false_type {};  // Any (operator, type) pair not listed below is unsupported.
+#ifdef _CG_HAS_OP_REDUX
+    template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>,    Ty> : public redux_is_add_supported<Ty> {};  // Specializations exist only when _CG_HAS_OP_REDUX is defined.
+    template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>,    Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>,  Ty> : public redux_is_logical_supported<Ty> {};
+    template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
+#endif
+
+    template <class Ty, template <class> class TyOp>  // Instantiates TyOp<Ty>, applies remove_qual, and looks the pair up in _redux_op_supported.
+    using redux_op_supported = _redux_op_supported<
+            typename details::remove_qual<TyOp<Ty>>,
+            Ty>;
+
+    // Groups smaller than 16 actually have worse performance characteristics when used with redux
+    // tiles of size 16 and 32 perform the same or better and have better code generation profiles
+    template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};  // Default: group type is not treated as redux-optimized.
+
+    template <unsigned int Sz, typename TyPar>  // thread_block_tile: optimized only for Sz >= 16.
+    struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
+                                                                                            bool,
+                                                                                            (Sz >= 16)> {};
+    template <unsigned int Sz, typename TyPar>  // internal_thread_block_tile: same Sz >= 16 rule.
+    struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::integral_constant<
+                                                                                            bool,
+                                                                                            (Sz >= 16)> {};
+    template <>  // coalesced_group is always treated as optimized.
+    struct _redux_group_optimized<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type  {};
+
+    template <typename TyGroup>  // Applies remove_qual to TyGroup before the lookup.
+    using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
+
+    template <template <class> class TyOp>  // Declaration only: signed overload, defined per operator by the specializations below.
+    _CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
+    template <template <class> class TyOp>  // Declaration only: unsigned overload.
+    _CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
+
+#ifdef _CG_HAS_OP_REDUX
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {  // plus -> __reduce_add_sync
+        return __reduce_add_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {  // less -> __reduce_min_sync
+        return __reduce_min_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {  // greater -> __reduce_max_sync
+        return __reduce_max_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {  // bit_and -> __reduce_and_sync
+        return __reduce_and_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {  // bit_xor -> __reduce_xor_sync
+        return __reduce_xor_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {  // bit_or -> __reduce_or_sync
+        return __reduce_or_sync(mask, val);
+    }
+
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {  // Unsigned variants use the same operator-to-intrinsic mapping.
+        return __reduce_add_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
+        return __reduce_min_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
+        return __reduce_max_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
+        return __reduce_and_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
+        return __reduce_xor_sync(mask, val);
+    }
+    template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
+        return __reduce_or_sync(mask, val);
+    }
+#endif
+
+
+    template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>  // Declared only; the bool defaults to is_unsigned<TyVal>::value and selects a specialization below.
+    struct _accelerated_op;
+
+    // Signed type redux intrinsic dispatch
+    template <typename TyVal>
+    struct _accelerated_op<TyVal, false> {
+        template <template <class> class TyOp>
+        _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
+            return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));  // Value is cast to int for the intrinsic, result cast back to TyVal.
+        }
+    };
+
+    // Unsigned type redux intrinsic dispatch
+    template <typename TyVal>
+    struct _accelerated_op<TyVal, true> {
+        template <template <class> class TyOp>
+        _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
+            return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));  // Value is cast to unsigned int for the intrinsic, result cast back to TyVal.
+        }
+    };
+
+    template <typename TyVal>  // Convenience alias that relies on the defaulted signedness argument.
+    using accelerated_op = _accelerated_op<TyVal>;
+
+
+    template <typename TyVal, typename TyFnInput, typename TyGroup>
+    class _redux_dispatch {  // static overload set choosing between accelerated_op::redux and coalesced_reduce
+        template <class Ty, template <class> class TyOp>
+        using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
+            redux_op_supported<Ty, TyOp>::value &&
+            redux_group_optimized<TyGroup>::value>;  // true only when both the (type, op) pair and the group type qualify
+        // Names void* when _redux_is_usable is true; otherwise enable_if has no ::type and the overload using it drops out.
+        template <class Ty, template <class> class TyOp>
+        using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
+        // Exact complement of redux_is_usable (note the negation in the condition).
+        template <class Ty, template <class> class TyOp>
+        using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
+
+    public:
+        // Dispatch to redux if the combination of op and args are supported
+        template<
+            template <class> class TyOp,
+            redux_is_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+            // Retrieve the mask for the group and dispatch to redux
+            return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
+        }
+        // Same redux path for an operator passed as an lvalue; op is used only in the return type.
+        template<
+            template <class> class TyOp,
+            redux_is_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
+            // Retrieve the mask for the group and dispatch to redux
+            return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
+        }
+
+        // Fallback shuffle sync reduction
+        template <
+            template <class> class TyOp,
+            redux_is_not_usable<TyFnInput, TyOp> = nullptr>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+            //Dispatch to fallback shuffle sync accelerated reduction
+            return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+        }
+        // Only an rvalue-operator overload exists for the fallback path.
+    };
+
+    // Group support for reduce.
+    template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};  // default: not supported
+    // Specializations that mark a group type as supported:
+    template <unsigned int Sz, typename TyPar>
+    struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
+    template <unsigned int Sz, typename TyPar>
+    struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::true_type {};
+    template <>
+    struct _reduce_group_supported<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type {};
+
+    template <typename TyGroup>
+    using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;  // the group type goes through remove_qual before the trait lookup
+
+    template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>  // operator of the form TyOp<TyFnInput>, passed as an rvalue
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
+        static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
+        // _redux_dispatch::reduce resolves to either the redux path or the coalesced_reduce fallback.
+        using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
+        return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+    }
+    // Same as above for an operator of the form TyOp<TyFnInput> passed as an lvalue.
+    template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
+        static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
+        // NOTE(review): forward<TyOp<TyFnInput>> names a non-reference type, so op presumably reaches dispatch as an rvalue - confirm.
+        using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
+        return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
+    }
+
+    // Overload taking an arbitrary operator type TyOp; it calls coalesced_reduce directly.
+    template <typename TyVal, typename TyOp, typename TyGroup>
+    _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
+        return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
+    }
+
+    template <unsigned int GroupId>  // GroupId: the group's _group_id value, as passed by the public reduce entry point
+    struct tile_reduce_dispatch;
+
+    template <>
+    struct tile_reduce_dispatch<details::coalesced_group_id> {  // groups tagged coalesced_group_id
+        template <typename TyGroup, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));  // forwards value and operator unchanged to details::reduce
+        }
+    };
+
+#if defined(_CG_CPP11_FEATURES)
+    template <>
+    struct tile_reduce_dispatch<details::multi_tile_group_id> {  // thread_block_tile groups tagged multi_tile_group_id
+        template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32;  // 32 matches the width of warpType above
+            // Step 1: each 32-wide tile reduces its values and writes the result to its scratch slot.
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                    *warp_scratch_location =
+                        details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+            };
+            auto inter_warp_lambda =  // Step 2: a num_warps-wide tile reduces the scratch values in place
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    *thread_scratch_location =
+                        details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
+            };
+            return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);  // helper is defined elsewhere; presumably runs both steps and returns the combined value - confirm
+        }
+    };
+
+    template <unsigned int GroupId>  // GroupId: the group's _group_id value, as passed by the reduce_*_async entry points
+    struct tile_async_reduce_dispatch;
+
+    template <>
+    struct tile_async_reduce_dispatch<details::coalesced_group_id> {  // groups tagged coalesced_group_id
+        template <typename GroupT, typename TyDst, typename TyVal, typename TyFn, typename TyResHandler>
+        _CG_STATIC_QUALIFIER void reduce(const GroupT& group, TyDst& dst, TyVal&& val, TyFn&& op, TyResHandler& res_handler) {  // dst is not referenced in this body; the result leaves through res_handler
+            // Do regular, in group reduction
+            auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+
+            // One thread stores/updates the destination
+            if (group.thread_rank() == 0) {
+                res_handler(result);
+            }
+        }
+    };
+
+    template <>
+    struct tile_async_reduce_dispatch<details::multi_tile_group_id> {  // thread_block_tile groups tagged multi_tile_group_id
+        template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn, typename TyResHandler>
+        _CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op, TyResHandler& res_handler) {  // dst is not referenced in this body
+            using TyVal = remove_qual<TyInputVal>;
+            const unsigned int num_warps = TySize / 32;
+            details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
+            auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);  // scratch slot indexed by thread_rank() / 32
+
+            // Do in warp reduce
+            auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
+            *warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
+
+            // Tile of size num_warps from the last warp to arrive does final reduction step
+            if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
+                auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
+                if (subwarp.meta_group_rank() == 0) {  // only the first num_warps-wide sub-tile of this warp continues
+                    auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
+                    auto thread_val = *thread_scratch_location;  // copy taken before the release below
+                    // Release other warps, we read their contribution already.
+                    subwarp.sync();
+                    details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
+                    TyVal result = details::reduce(subwarp, thread_val, op);
+                    // One thread stores the result or updates the atomic
+                    if (subwarp.thread_rank() == 0) {
+                        res_handler(result);
+                    }
+                }
+                warp.sync();
+            }
+        }
+    };
+#endif
+
+    template <typename TyGroup, typename TyInputVal, typename TyRetVal>  // compile-time validation only; the body has no runtime statements
+    _CG_QUALIFIER void check_reduce_params() {
+        static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");  // operator must return its input type
+        static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");  // group type must be marked supported
+    }
+
+    template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>  // compile-time validation for the reduce_*_async entry points
+    _CG_QUALIFIER void check_async_reduce_params() {
+        check_reduce_params<TyGroup, TyInputVal, TyRetVal>();  // reuse the checks of the synchronous reduce
+        static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");  // additionally, destination and input types must match
+    }
+} // details
+
+// Reduces `val` across `group` with `op`: parameters are checked at compile time, then the dispatcher selected by TyGroup::_group_id does the work.
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
+    return details::tile_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+
+#if defined(_CG_CPP11_FEATURES)
+
+# if defined(_CG_HAS_STL_ATOMICS)
+// Reduces `val` over `group`; the dispatcher then hands the result to a handler that applies details::atomic_update to the cuda::atomic `dst`.
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    // Result handler: captures dst and op by reference and is called by the dispatcher with the reduced value.
+    auto merge_into_dst = [&] (TyVal& reduced) { details::atomic_update(dst, reduced, op); };
+    details::tile_async_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), merge_into_dst);
+}
+
+// Same operation for a cuda::atomic_ref destination.
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    // Result handler: captures dst and op by reference and is called by the dispatcher with the reduced value.
+    auto merge_into_dst = [&] (TyVal& reduced) { details::atomic_update(dst, reduced, op); };
+    details::tile_async_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), merge_into_dst);
+}
+
+// Reduces `val` over `group`; the dispatcher then hands the result to a handler that applies details::atomic_store to the cuda::atomic `dst`.
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_store_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    // Result handler: captures dst by reference and is called by the dispatcher with the reduced value.
+    auto write_to_dst = [&] (TyVal& reduced) { details::atomic_store(dst, reduced); };
+    details::tile_async_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), write_to_dst);
+}
+
+// Same operation for a cuda::atomic_ref destination.
+template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_store_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    // Result handler: captures dst by reference and is called by the dispatcher with the reduced value.
+    auto write_to_dst = [&] (TyVal& reduced) { details::atomic_store(dst, reduced); };
+    details::tile_async_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), write_to_dst);
+}
+# endif
+
+// Reduces `val` over `group`; the dispatcher then hands the result to a handler that assigns it through the raw pointer `dst`.
+template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
+void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
+    details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    // Result handler: plain (non-atomic) assignment through dst.
+    auto write_to_dst = [&] (TyVal& reduced) { *dst = reduced; };
+    details::tile_async_reduce_dispatch<TyGroup::_group_id>::reduce(
+        group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), write_to_dst);
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_REDUCE_H_
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h
new file mode 100644
index 0000000000000000000000000000000000000000..96d68350e48307d120289e22872abc66f5188115
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h
@@ -0,0 +1,320 @@
+/* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_SCAN_H_
+#define _CG_SCAN_H_
+
+#include "info.h"
+#include "helpers.h"
+#include "functional.h"
+#include "coalesced_scan.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details {
+
+    // Group support for scan.
+    template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};  // default: not supported
+    // Specializations that mark a group type as supported:
+    template <unsigned int Sz, typename TyPar>
+    struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
+    template <unsigned int Sz, typename TyPar>
+    struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>>            : public _CG_STL_NAMESPACE::true_type {};
+    template <>
+    struct _scan_group_supported<cooperative_groups::coalesced_group>              : public _CG_STL_NAMESPACE::true_type {};
+
+    template <typename TyGroup>
+    using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;  // the group type goes through remove_qual before the trait lookup
+
+    template <bool IsIntegralPlus>  // declaration only; no definition appears in this header
+    struct integral_optimized_scan;
+
+    enum class ScanType { exclusive, inclusive };  // compile-time tag telling the dispatch structs which scan flavour to produce
+
+    template <unsigned int GroupId,  ScanType TyScan>  // declaration only; specialized per group id
+    struct scan_dispatch;
+
+    template <ScanType TyScan>
+    struct scan_dispatch<details::coalesced_group_id, TyScan> {  // groups tagged coalesced_group_id
+        template <typename TyGroup, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            auto scan_result = coalesced_inclusive_scan(group, val, op);  // the inclusive scan is always computed first
+            if (TyScan == ScanType::exclusive) {  // TyScan is a template argument, so this condition is a compile-time constant
+                scan_result = convert_inclusive_to_exclusive(group,
+                                                             scan_result,
+                                                             _CG_STL_NAMESPACE::forward<TyVal>(val),
+                                                             _CG_STL_NAMESPACE::forward<TyFn>(op));
+            }
+            return scan_result;
+        }
+    };
+
+#if defined(_CG_CPP11_FEATURES)
+    template <ScanType TyScan>
+    struct scan_dispatch<details::multi_tile_group_id, TyScan> {  // thread_block_tile groups tagged multi_tile_group_id
+        template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32;  // 32 matches the width of warpType above
+            // In warp scan result, calculated in warp_lambda
+            TyRet warp_scan;
+
+            // In warp scan, put sum in the warp_scratch_location
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                warp_scan = 
+                    details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+                if (warp.thread_rank() + 1 == warp.size()) {  // only the highest-ranked thread writes the warp total
+                    *warp_scratch_location = warp_scan;
+                }
+                if (TyScan == ScanType::exclusive) {
+                    warp_scan = warp.shfl_up(warp_scan, 1);  // done after the total was saved; shifts the inclusive results by one rank
+                }
+            };
+
+            // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
+            // to its in-warp scan result
+            auto inter_warp_lambda =
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    auto thread_val = *thread_scratch_location;
+                    auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
+                    *thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);  // scratch value is overwritten with its exclusive-scan result
+            };
+
+            TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
+            if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {  // exclusive scan: rank 0 of each warp returns the helper's value as is
+                return previous_warps_sum;
+            }
+            if (warpType::meta_group_rank() == 0) {  // warp with meta_group_rank 0 returns its in-warp result unmodified
+                return warp_scan;
+            }
+            else {
+                return op(warp_scan, previous_warps_sum);
+            }
+        }
+    };
+
+#if defined(_CG_HAS_STL_ATOMICS)
+    template <unsigned int GroupId,  ScanType TyScan>  // declaration only; specialized per group id
+    struct scan_update_dispatch;
+
+    template <ScanType TyScan>
+    struct scan_update_dispatch<details::coalesced_group_id, TyScan> {  // groups tagged coalesced_group_id
+        template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            details::remove_qual<TyVal> old;  // assigned only on the last-ranked thread before the shfl below
+
+            // Do regular in group scan
+            auto scan_result = details::coalesced_inclusive_scan(group, val, op);
+
+            // Last thread updates the atomic and distributes its old value to other threads
+            if (group.thread_rank() == group.size() - 1) {                                                
+                old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
+            }
+            old = group.shfl(old, group.size() - 1);  // every thread takes the value held by rank size() - 1
+            if (TyScan == ScanType::exclusive) {
+                scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+            }
+            scan_result = op(old, scan_result);  // combine the value returned by atomic_update with the scan result
+            return scan_result;
+        }
+    };
+
+    template <ScanType TyScan>
+    struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {  // thread_block_tile groups tagged multi_tile_group_id
+        template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
+        _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+            using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
+            using TyRet = details::remove_qual<TyVal>;
+            const unsigned int num_warps = Size / 32;  // 32 matches the width of warpType above
+            // In warp scan result, calculated in warp_lambda
+            TyRet warp_scan;
+
+            // In warp scan, put sum in the warp_scratch_location
+            auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
+                warp_scan = 
+                    details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
+                if (warp.thread_rank() + 1 == warp.size()) {  // only the highest-ranked thread writes the warp total
+                    *warp_scratch_location = warp_scan;
+                }
+                if (TyScan == ScanType::exclusive) {
+                    warp_scan = warp.shfl_up(warp_scan, 1);  // done after the total was saved; shifts the inclusive results by one rank
+                }
+            };
+
+            // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
+            // to its in-warp scan result
+            auto inter_warp_lambda =
+                [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
+                    auto thread_val = *thread_scratch_location;
+                    auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
+                    TyRet offset;  // assigned only on the last-ranked thread before the shfl below
+                    // Single thread does the atomic update with sum of all contributions and reads the old value.
+                    if (subwarp.thread_rank() == subwarp.size() - 1) {
+                        offset = details::atomic_update(dst, scan_result, op);
+                    }
+                    offset = subwarp.shfl(offset, subwarp.size() - 1);  // every thread takes the value held by rank size() - 1
+                    scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
+                    // Add offset read from the atomic to the scanned warp sum.
+                    // Skipping first thread, since it got a default-constructed value from the conversion,
+                    // it should just return the offset received from the thread that did the atomic update.
+                    if (subwarp.thread_rank() != 0) {
+                        offset = op(scan_result, offset);
+                    }
+                    *thread_scratch_location = offset;
+            };
+
+            TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
+            if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {  // exclusive scan: rank 0 of each warp returns the helper's value as is
+                return previous_warps_sum;
+            }
+            return op(warp_scan, previous_warps_sum);  // no meta_group_rank() == 0 shortcut here, unlike scan_dispatch
+        }
+    };
+#endif
+#endif
+
+    template <typename TyGroup, typename TyInputVal, typename TyRetVal>  // compile-time validation only; the body has no runtime statements
+    _CG_QUALIFIER void check_scan_params() {
+        static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");  // operator must return its input type
+        static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");  // group type must be marked supported
+    }
+
+#if defined(_CG_HAS_STL_ATOMICS)
+    template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>  // compile-time validation for the *_scan_update entry points
+    _CG_QUALIFIER void check_scan_update_params() {
+        check_scan_params<TyGroup, TyInputVal, TyRetVal>();  // reuse the checks of the plain scans
+        static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");  // additionally, destination and input types must match
+    }
+#endif
+
+} // details
+
+// Inclusive scan of `val` over `group` with `op`: compile-time checks, then the scan_dispatch specialization selected by TyGroup::_group_id.
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
+    return details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Convenience overload: uses cooperative_groups::plus over the unqualified value type as the operator.
+template <typename TyGroup, typename TyVal>
+_CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
+    using TyPlain = details::remove_qual<TyVal>;
+    return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<TyPlain>());
+}
+
+// Exclusive scan of `val` over `group` with `op`: compile-time checks, then the scan_dispatch specialization selected by TyGroup::_group_id.
+template <typename TyGroup, typename TyVal, typename TyFn>
+_CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
+    return details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Convenience overload: uses cooperative_groups::plus over the unqualified value type as the operator.
+template <typename TyGroup, typename TyVal>
+_CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
+    using TyPlain = details::remove_qual<TyVal>;
+    return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<TyPlain>());
+}
+
+#if defined(_CG_HAS_STL_ATOMICS)
+// Inclusive scan over `group` that also passes the cuda::atomic `dst` to the scan_update_dispatch specialization selected by TyGroup::_group_id.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    return details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Convenience overload: uses cooperative_groups::plus<TyVal> as the operator.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
+    using TyPlusOp = cooperative_groups::plus<TyVal>;
+    return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), TyPlusOp());
+}
+
+// Exclusive scan over `group` that also passes the cuda::atomic `dst` to the scan_update_dispatch specialization selected by TyGroup::_group_id.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    return details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Convenience overload: uses cooperative_groups::plus<TyVal> as the operator.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
+    using TyPlusOp = cooperative_groups::plus<TyVal>;
+    return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), TyPlusOp());
+}
+
+// Inclusive scan over `group` that also passes the cuda::atomic_ref `dst` to the scan_update_dispatch specialization selected by TyGroup::_group_id.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    return details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Convenience overload: uses cooperative_groups::plus<TyVal> as the operator.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
+    using TyPlusOp = cooperative_groups::plus<TyVal>;
+    return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), TyPlusOp());
+}
+
+// Exclusive scan over `group` that also passes the cuda::atomic_ref `dst` to the scan_update_dispatch specialization selected by TyGroup::_group_id.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
+_CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
+    details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
+    return details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
+}
+// Convenience overload: uses cooperative_groups::plus<TyVal> as the operator.
+template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
+_CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
+    using TyPlusOp = cooperative_groups::plus<TyVal>;
+    return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), TyPlusOp());
+}
+#endif
+
+_CG_END_NAMESPACE
+
+#endif // _CG_SCAN_H_
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/sync.h b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/sync.h
new file mode 100644
index 0000000000000000000000000000000000000000..44a4f56daac3d551495d3cde169775a805ca47c8
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/sync.h
@@ -0,0 +1,282 @@
+ /* Copyright 1993-2016 NVIDIA Corporation.  All rights reserved.
+  *
+  * NOTICE TO LICENSEE:
+  *
+  * The source code and/or documentation ("Licensed Deliverables") are
+  * subject to NVIDIA intellectual property rights under U.S. and
+  * international Copyright laws.
+  *
+  * The Licensed Deliverables contained herein are PROPRIETARY and
+  * CONFIDENTIAL to NVIDIA and are being provided under the terms and
+  * conditions of a form of NVIDIA software license agreement by and
+  * between NVIDIA and Licensee ("License Agreement") or electronically
+  * accepted by Licensee.  Notwithstanding any terms or conditions to
+  * the contrary in the License Agreement, reproduction or disclosure
+  * of the Licensed Deliverables to any third party without the express
+  * written consent of NVIDIA is prohibited.
+  *
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+  * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  THEY ARE
+  * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+  * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+  * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+  * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+  * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+  * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+  * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+  * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+  * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+  * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+  * OF THESE LICENSED DELIVERABLES.
+  *
+  * U.S. Government End Users.  These Licensed Deliverables are a
+  * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+  * 1995), consisting of "commercial computer software" and "commercial
+  * computer software documentation" as such terms are used in 48
+  * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
+  * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+  * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+  * U.S. Government End Users acquire the Licensed Deliverables with
+  * only those rights set forth herein.
+  *
+  * Any use of the Licensed Deliverables in individual and commercial
+  * software must include, in the user documentation and internal
+  * comments to the code, the above Disclaimer and U.S. Government End
+  * Users Notice.
+  */
+
+#ifndef _CG_GRID_H
+#define _CG_GRID_H
+
+#include "info.h"
+
+_CG_BEGIN_NAMESPACE
+
+namespace details
+{
+
+typedef unsigned int barrier_t; // barrier word type taken by both the grid-level and the warp-level sync routines in this file
+
+_CG_STATIC_QUALIFIER bool bar_has_flipped(unsigned int old_arrive, unsigned int current_arrive) {
+    return ((old_arrive & 0x80000000) != (current_arrive & 0x80000000)); // true when bit 31 differs between the two samples
+}
+
+_CG_STATIC_QUALIFIER bool is_cta_master() {
+    return !(threadIdx.x + threadIdx.y + threadIdx.z); // true only when the sum of the three thread indices is zero
+}
+
+_CG_STATIC_QUALIFIER unsigned int sync_grids_arrive(volatile barrier_t *arrived) { // adds this block's contribution to *arrived; returns the pre-add value in the CTA master, 0 elsewhere
+    unsigned int oldArrive = 0;
+
+    __barrier_sync(0); // NOTE(review): presumably a block-wide barrier so the whole CTA arrives before its master signals -- confirm
+
+    if (is_cta_master()) {
+        unsigned int expected = gridDim.x * gridDim.y * gridDim.z; // number of blocks in the grid
+        bool gpu_master = (blockIdx.x + blockIdx.y + blockIdx.z == 0); // block whose index sum is zero
+        unsigned int nb = 1; // every other block contributes 1
+
+        if (gpu_master) {
+            nb = 0x80000000 - (expected - 1); // together with the other (expected - 1) increments the total added is 0x80000000, i.e. bit 31 toggles
+        }
+
+#if __CUDA_ARCH__ < 700
+        // Fence; barrier update; volatile polling; fence
+        __threadfence();
+
+        oldArrive = atomicAdd((unsigned int*)arrived, nb);
+#else
+        // Barrier update with release; polling with acquire
+        asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(oldArrive) : _CG_ASM_PTR_CONSTRAINT((unsigned int*)arrived), "r"(nb) : "memory");
+#endif
+    }
+
+    return oldArrive;
+}
+
+
+_CG_STATIC_QUALIFIER void sync_grids_wait(unsigned int oldArrive, volatile barrier_t *arrived) { // CTA master spins until bit 31 of *arrived differs from oldArrive (presumably the value returned by sync_grids_arrive)
+    if (is_cta_master()) {
+#if __CUDA_ARCH__ < 700
+        while (!bar_has_flipped(oldArrive, *arrived)); // volatile polling of the barrier word
+
+        __threadfence(); // fence after the polling loop
+
+#else
+        unsigned int current_arrive;
+        do {
+            asm volatile("ld.acquire.gpu.u32 %0,[%1];" : "=r"(current_arrive) : _CG_ASM_PTR_CONSTRAINT((unsigned int *)arrived) : "memory");
+        } while (!bar_has_flipped(oldArrive, current_arrive)); // acquire-load polling of the barrier word
+#endif
+    }
+
+    __barrier_sync(0); // the remaining threads of the block wait here until the master has left the polling loop
+}
+
+/* - Multi warp groups synchronization routines - */
+
+// Atomically OR val into *addr and return the previous value. Both acquire and release are needed for the last warp, since it won't be able to acquire with red.and
+_CG_STATIC_QUALIFIER unsigned int atom_or_acq_rel_cta(unsigned int *addr, unsigned int val) {
+    unsigned int old;
+#if __CUDA_ARCH__ < 700
+    __threadfence_block(); // fence first, then a plain atomicOr, when __CUDA_ARCH__ < 700
+    old = atomicOr(addr, val);
+#else
+    asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(old) : _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");
+#endif
+    return old;
+}
+
+// OR val into *addr without returning the old value. Special case where the barrier is arrived at, but not waited on
+_CG_STATIC_QUALIFIER void red_or_release_cta(unsigned int *addr, unsigned int val) {
+#if __CUDA_ARCH__ < 700
+    __threadfence_block(); // fence first, then a plain atomicOr, when __CUDA_ARCH__ < 700
+    atomicOr(addr, val);
+#else
+    asm volatile("red.or.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");
+#endif
+}
+
+// AND val into *addr. Usually called by the last arriving warp to release the other warps; can be relaxed, since the or was already acq_rel
+_CG_STATIC_QUALIFIER void red_and_relaxed_cta(unsigned int *addr, unsigned int val) {
+#if __CUDA_ARCH__ < 700
+    atomicAnd(addr, val); // no fence on this path
+#else
+    asm volatile("red.and.relaxed.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");
+#endif
+}
+
+// AND val into *addr with release ordering. Special case of release, where the last warp was doing extra work before
+//  releasing the others; it needs to be a release to ensure that the extra work is visible
+_CG_STATIC_QUALIFIER void red_and_release_cta(unsigned int *addr, unsigned int val) {
+#if __CUDA_ARCH__ < 700
+    __threadfence_block(); // fence first, then a plain atomicAnd, when __CUDA_ARCH__ < 700
+    atomicAnd(addr, val);
+#else
+    asm volatile("red.and.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");
+#endif
+}
+
+// Read the barrier and return its value; acquire ensures all memory operations following the sync are correctly performed after it is released
+_CG_STATIC_QUALIFIER unsigned int ld_acquire_cta(unsigned int *addr) {
+    unsigned int val;
+#if __CUDA_ARCH__ < 700
+    val = *((volatile unsigned int*) addr); // volatile read followed by a fence when __CUDA_ARCH__ < 700
+    __threadfence_block();
+#else
+    asm volatile("ld.acquire.cta.u32 %0,[%1];" : "=r"(val) : _CG_ASM_PTR_CONSTRAINT(addr) : "memory");
+#endif
+    return val;
+}
+
+// Get the synchronization bit mask of my thread_block_tile spanning num_warps warps. One bit per warp: thread ranks 0..31 own
+// bit 0, ranks 32..63 bit 1, etc., so tile k owns bits [k * num_warps, (k + 1) * num_warps); masks of different tiles never
+// overlap. Unsigned literals keep every shift in unsigned arithmetic, so a mask reaching bit 31 never touches a sign bit.
+_CG_STATIC_QUALIFIER unsigned int get_group_mask(unsigned int thread_rank, unsigned int num_warps) {
+    return num_warps == 32 ? ~0u : ((1u << num_warps) - 1u) << (num_warps * (thread_rank / (num_warps * 32)));
+}
+
+_CG_STATIC_QUALIFIER void barrier_wait(barrier_t *arrived, unsigned int warp_bit) {
+    while ((ld_acquire_cta(arrived) & warp_bit) != 0) { /* spin until warp_bit is no longer set in the barrier word */ }
+}
+
+// Default blocking sync: a warp leaves only after every warp in group_mask has set its bit in *arrived.
+_CG_STATIC_QUALIFIER void sync_warps(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int warp_id = thread_rank / 32;
+    bool warp_master = (thread_rank % 32 == 0);
+    unsigned int warp_bit = 1u << warp_id; // unsigned literal: warp 31 maps to bit 31, which a signed 1 << 31 would shift into the sign bit
+    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
+
+    __syncwarp(0xFFFFFFFF);
+
+    if (warp_master) {
+        unsigned int old = atom_or_acq_rel_cta(arrived, warp_bit); // announce my warp's arrival
+        if (((old | warp_bit) & group_mask) == group_mask) {
+            red_and_relaxed_cta(arrived, ~group_mask); // last arriver: clear the group's bits, which ends the waiters' spin
+        }
+        else {
+            barrier_wait(arrived, warp_bit); // spin until my bit has been cleared
+        }
+    }
+
+    __syncwarp(0xFFFFFFFF);
+}
+
+// Blocking sync, except that the last arriving warp returns true without releasing the others, so it can do other work first.
+// A warp returning true from this function needs to call sync_warps_release; all other warps return false once released.
+_CG_STATIC_QUALIFIER bool sync_warps_last_releases(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int warp_id = thread_rank / 32;
+    bool warp_master = (thread_rank % 32 == 0);
+    unsigned int warp_bit = 1u << warp_id; // unsigned literal: warp 31 maps to bit 31, which a signed 1 << 31 would shift into the sign bit
+    unsigned int group_mask = get_group_mask(thread_rank, num_warps);
+
+    __syncwarp(0xFFFFFFFF);
+
+    unsigned int old = 0;
+    if (warp_master) {
+        old = atom_or_acq_rel_cta(arrived, warp_bit); // only lane 0 announces the arrival
+    }
+    old = __shfl_sync(0xFFFFFFFF, old, 0); // share lane 0's view of the barrier so the whole warp takes the same branch
+    if (((old | warp_bit) & group_mask) == group_mask) {
+        return true;
+    }
+    barrier_wait(arrived, warp_bit); // every lane of a non-last warp spins until the bit has been cleared
+
+    return false;
+}
+
+// Release my group from the barrier by clearing all of the group's bits; only the thread flagged is_master performs the update.
+_CG_STATIC_QUALIFIER void sync_warps_release(barrier_t *arrived, bool is_master, unsigned int thread_rank, unsigned int num_warps) {
+    if (!is_master) {
+        return;
+    }
+    red_and_release_cta(arrived, ~get_group_mask(thread_rank, num_warps));
+}
+
+// Arrive at my group barrier, but don't block or release the barrier, even if every one arrives.
+// sync_warps_release needs to be called by some warp after this one to reset the barrier.
+_CG_STATIC_QUALIFIER void sync_warps_arrive(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
+    unsigned int warp_id = thread_rank / 32;
+    bool warp_master = (thread_rank % 32 == 0);
+    unsigned int warp_bit = 1u << warp_id; // unsigned literal: warp 31 maps to bit 31, which a signed 1 << 31 would shift into the sign bit
+    (void)num_warps; // unused: arriving only sets my warp's bit, so the group mask is not needed here
+
+    __syncwarp(0xFFFFFFFF);
+
+    if (warp_master) {
+        red_or_release_cta(arrived, warp_bit); // set my warp's bit; nothing is read back
+    }
+}
+
+// Wait for my warp to be released from the barrier, i.e. until its bit in *arrived is cleared. Warp must have arrived first.
+_CG_STATIC_QUALIFIER void sync_warps_wait(barrier_t *arrived, unsigned int thread_rank) {
+    unsigned int warp_id = thread_rank / 32;
+    unsigned int warp_bit = 1u << warp_id; // unsigned literal: warp 31 maps to bit 31, which a signed 1 << 31 would shift into the sign bit
+
+    barrier_wait(arrived, warp_bit);
+}
+
+// Wait for a specific warp to arrive at the barrier, i.e. spin until the bit of warp wait_warp_id is set in *arrived
+_CG_QUALIFIER void sync_warps_wait_for_specific_warp(barrier_t *arrived, unsigned int wait_warp_id) {
+    unsigned int wait_mask = 1u << wait_warp_id; // unsigned literal: warp 31 maps to bit 31, which a signed 1 << 31 would shift into the sign bit
+    while((ld_acquire_cta(arrived) & wait_mask) != wait_mask);
+}
+
+// Initialize the bit corresponding to my warp in the barrier by clearing it; lane 0 of the warp performs the update
+_CG_QUALIFIER void sync_warps_reset(barrier_t *arrived, unsigned int thread_rank) {
+    unsigned int warp_id = thread_rank / 32;
+    unsigned int warp_bit = 1u << warp_id; // unsigned literal: warp 31 maps to bit 31, which a signed 1 << 31 would shift into the sign bit
+
+    __syncwarp(0xFFFFFFFF);
+
+    if (thread_rank % 32 == 0) {
+        red_and_release_cta(arrived, ~warp_bit);
+    }
+    // No need to sync after the atomic, there will be a sync of the group that is being partitioned right after this.
+}
+
+} // details
+
+_CG_END_NAMESPACE
+
+#endif // _CG_GRID_H
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eef47b306554c3440fcf06b0cf120ea05f992104
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12 b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12
new file mode 100644
index 0000000000000000000000000000000000000000..9b5121d6c2bab36a1b404058da1e8488f2933c94
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8774224f5b11a73b15d074a3fcce7327322c5c4cfdfd924d6a826779eec968fe
+size 707904
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cudnn/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cudnn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e08847c95f1294bc99e96e737a53cc6ebb7a458
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright 2014-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*   cudnn : Neural Networks Library  */
+
+#if !defined(CUDNN_H_)
+#define CUDNN_H_
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <cuda_runtime_api.h>
+#include "cudnn_version.h"
+#include "cudnn_graph.h"
+#include "cudnn_ops.h"
+#include "cudnn_adv.h"
+#include "cudnn_cnn.h"
+
+#if defined(__cplusplus)
+}
+#endif
+#endif /* CUDNN_H_ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h
new file mode 100644
index 0000000000000000000000000000000000000000..b67d6529aa4e6f9a3605ce7b34499714fe4057aa
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h
@@ -0,0 +1,671 @@
+/*
+ * Copyright 2014-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*   cudnn_adv : cuDNN's advanced and experimental features.
+
+*/
+
+#if !defined(CUDNN_ADV_H_)
+#define CUDNN_ADV_H_
+
+#include <stdint.h>
+
+#include "cudnn_version.h"
+#include "cudnn_ops.h"
+
+/* These version numbers are autogenerated, do not edit manually. */
+#define CUDNN_ADV_MAJOR 9
+#define CUDNN_ADV_MINOR 1
+#define CUDNN_ADV_PATCH 0
+
+#if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL)
+#error Version mismatch in cuDNN ADV INFER!!!
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* BASIC RNN API */
+
+typedef enum {
+    CUDNN_RNN_ALGO_STANDARD               = 0,
+    CUDNN_RNN_ALGO_PERSIST_STATIC         = 1,
+    CUDNN_RNN_ALGO_PERSIST_DYNAMIC        = 2,
+    CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
+    CUDNN_RNN_ALGO_COUNT                  = 4, /* equals the number of algorithm values listed above */
+} cudnnRNNAlgo_t; /* type of the 'algo' argument of cudnnSetRNNDescriptor_v8() */
+
+typedef enum {
+    CUDNN_FWD_MODE_INFERENCE = 0,
+    CUDNN_FWD_MODE_TRAINING  = 1,
+} cudnnForwardMode_t; /* type of the 'fwdMode' argument of cudnnRNNForward() and cudnnGetRNNTempSpaceSizes() */
+
+typedef enum {
+    CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLU activation */
+    CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
+    CUDNN_LSTM     = 2, /* LSTM with optional recurrent projection and clipping */
+    CUDNN_GRU      = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
+} cudnnRNNMode_t; /* type of the 'cellMode' argument of cudnnSetRNNDescriptor_v8() */
+
+typedef enum {
+    CUDNN_RNN_NO_BIAS         = 0, /* rnn cell formulas do not use biases */
+    CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
+    CUDNN_RNN_DOUBLE_BIAS     = 2, /* default, rnn cell formulas use two bias vectors */
+    CUDNN_RNN_SINGLE_REC_BIAS = 3  /* rnn cell formulas use one recurrent bias in recurrent GEMM */
+} cudnnRNNBiasMode_t; /* type of the 'biasMode' argument of cudnnSetRNNDescriptor_v8() */
+
+typedef enum {
+    CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
+    CUDNN_BIDIRECTIONAL  = 1, /* output concatenation at each layer */
+} cudnnDirectionMode_t; /* type of the 'dirMode' argument of cudnnSetRNNDescriptor_v8() */
+
+typedef enum {
+    CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
+    CUDNN_SKIP_INPUT   = 1, /* fixed identity matrix in the first layer input GEMM */
+} cudnnRNNInputMode_t; /* type of the 'inputMode' argument of cudnnSetRNNDescriptor_v8() */
+
+typedef enum {
+    CUDNN_RNN_CLIP_NONE   = 0, /* disables LSTM cell clipping */
+    CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
+} cudnnRNNClipMode_t; /* type of the 'clipMode' argument of cudnnRNNSetClip_v8() / cudnnRNNSetClip_v9() */
+
+typedef enum {
+    CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED   = 0, /* padded, outer stride from one time-step to the next */
+    CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED     = 1, /* sequence length sorted and packed as in basic RNN api */
+    CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
+} cudnnRNNDataLayout_t; /* type of the 'layout' argument of cudnnSetRNNDataDescriptor() */
+
+/* For auxFlags in cudnnSetRNNDescriptor_v8() */
+#define CUDNN_RNN_PADDED_IO_DISABLED 0
+#define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
+
+struct cudnnRNNStruct;
+typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
+
+struct cudnnRNNDataStruct;
+typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
+
+/*
+ * mathPrec in cudnnSetRNNDescriptor_v8() specifies compute precision.
+ * Compute precision is further modified by mathType that sets the
+ * preferred option for using NVIDIA Tensor Cores.  dataType specifies
+ * the input/output data type and the weight/bias type.
+ */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
+                         cudnnRNNAlgo_t algo,
+                         cudnnRNNMode_t cellMode,
+                         cudnnRNNBiasMode_t biasMode,
+                         cudnnDirectionMode_t dirMode,
+                         cudnnRNNInputMode_t inputMode,
+                         cudnnDataType_t dataType,
+                         cudnnDataType_t mathPrec,
+                         cudnnMathType_t mathType,
+                         int32_t inputSize,
+                         int32_t hiddenSize,
+                         int32_t projSize,
+                         int32_t numLayers,
+                         cudnnDropoutDescriptor_t dropoutDesc,
+                         uint32_t auxFlags);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
+                         cudnnRNNAlgo_t *algo,
+                         cudnnRNNMode_t *cellMode,
+                         cudnnRNNBiasMode_t *biasMode,
+                         cudnnDirectionMode_t *dirMode,
+                         cudnnRNNInputMode_t *inputMode,
+                         cudnnDataType_t *dataType,
+                         cudnnDataType_t *mathPrec,
+                         cudnnMathType_t *mathType,
+                         int32_t *inputSize,
+                         int32_t *hiddenSize,
+                         int32_t *projSize,
+                         int32_t *numLayers,
+                         cudnnDropoutDescriptor_t *dropoutDesc,
+                         uint32_t *auxFlags);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
+                   cudnnRNNClipMode_t clipMode,
+                   cudnnNanPropagation_t clipNanOpt,
+                   double lclip,
+                   double rclip);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
+                   cudnnRNNClipMode_t *clipMode,
+                   cudnnNanPropagation_t *clipNanOpt,
+                   double *lclip,
+                   double *rclip);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
+                          cudnnRNNDescriptor_t rnnDesc,
+                          cudnnForwardMode_t fwdMode,
+                          cudnnRNNDataDescriptor_t xDesc,
+                          size_t *workSpaceSize,
+                          size_t *reserveSpaceSize);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNWeightParams(cudnnHandle_t handle,
+                        cudnnRNNDescriptor_t rnnDesc,
+                        int32_t pseudoLayer,
+                        size_t weightSpaceSize,
+                        const void *weightSpace,
+                        int32_t linLayerID,
+                        cudnnTensorDescriptor_t mDesc,
+                        void **mAddr,
+                        cudnnTensorDescriptor_t bDesc,
+                        void **bAddr);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
+                          cudnnDataType_t dataType,
+                          cudnnRNNDataLayout_t layout,
+                          int maxSeqLength,
+                          int batchSize,
+                          int vectorSize,
+                          const int seqLengthArray[], /* length of each sequence in the batch */
+                          void *paddingFill);         /* symbol for filling padding position in output */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
+                          cudnnDataType_t *dataType,
+                          cudnnRNNDataLayout_t *layout,
+                          int *maxSeqLength,
+                          int *batchSize,
+                          int *vectorSize,
+                          int arrayLengthRequested,
+                          int seqLengthArray[],
+                          void *paddingFill);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNForward(cudnnHandle_t handle,
+                cudnnRNNDescriptor_t rnnDesc,
+                cudnnForwardMode_t fwdMode,
+                const int32_t devSeqLengths[],
+                cudnnRNNDataDescriptor_t xDesc,
+                const void *x,
+                cudnnRNNDataDescriptor_t yDesc,
+                void *y,
+                cudnnTensorDescriptor_t hDesc,
+                const void *hx,
+                void *hy,
+                cudnnTensorDescriptor_t cDesc,
+                const void *cx,
+                void *cy,
+                size_t weightSpaceSize,
+                const void *weightSpace,
+                size_t workSpaceSize,
+                void *workSpace,
+                size_t reserveSpaceSize,
+                void *reserveSpace);
+
+/* Sequence data descriptor */
+
+typedef enum {
+    CUDNN_SEQDATA_TIME_DIM  = 0, /* index in time */
+    CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
+    CUDNN_SEQDATA_BEAM_DIM  = 2, /* index in beam */
+    CUDNN_SEQDATA_VECT_DIM  = 3  /* index in vector */
+} cudnnSeqDataAxis_t; /* element type of the 'axes' array of cudnnSetSeqDataDescriptor() / cudnnGetSeqDataDescriptor() */
+
+struct cudnnSeqDataStruct;
+typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED;
+
+#define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
+                          cudnnDataType_t dataType,
+                          int nbDims,
+                          const int dimA[],
+                          const cudnnSeqDataAxis_t axes[],
+                          size_t seqLengthArraySize,
+                          const int seqLengthArray[],
+                          void *paddingFill);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
+                          cudnnDataType_t *dataType,
+                          int *nbDims,
+                          int nbDimsRequested,
+                          int dimA[],
+                          cudnnSeqDataAxis_t axes[],
+                          size_t *seqLengthArraySize,
+                          size_t seqLengthSizeRequested,
+                          int seqLengthArray[],
+                          void *paddingFill);
+
+/* Multihead Attention */
+
+/*
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
+ * Use the bitwise OR operator to combine several settings listed below.  Additional
+ * minor options can be added here w/o changing or introducing new API functions.
+ */
+#define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0         /* multiple Q-s map to a single (K,V) set when beam size > 1 */
+#define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
+#define CUDNN_ATTN_DISABLE_PROJ_BIASES 0         /* no biases in attention input and output projections */
+#define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1)  /* use biases in attention input and output projections */
+
+struct cudnnAttnStruct;
+typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED;
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
+                       unsigned attnMode,
+                       int nHeads,
+                       double smScaler,
+                       cudnnDataType_t dataType,
+                       cudnnDataType_t computePrec,
+                       cudnnMathType_t mathType,
+                       cudnnDropoutDescriptor_t attnDropoutDesc,
+                       cudnnDropoutDescriptor_t postDropoutDesc,
+                       int qSize,
+                       int kSize,
+                       int vSize,
+                       int qProjSize,
+                       int kProjSize,
+                       int vProjSize,
+                       int oProjSize,
+                       int qoMaxSeqLength,
+                       int kvMaxSeqLength,
+                       int maxBatchSize,
+                       int maxBeamSize);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI /* getter counterpart of cudnnSetAttnDescriptor(): same parameter names in the same order, each taken by pointer */
+cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc, /* NOTE(review): the pointer arguments are presumably outputs filled from attnDesc -- confirm in the cuDNN API reference */
+                       unsigned *attnMode,
+                       int *nHeads,
+                       double *smScaler,
+                       cudnnDataType_t *dataType,
+                       cudnnDataType_t *computePrec,
+                       cudnnMathType_t *mathType,
+                       cudnnDropoutDescriptor_t *attnDropoutDesc,
+                       cudnnDropoutDescriptor_t *postDropoutDesc,
+                       int *qSize,
+                       int *kSize,
+                       int *vSize,
+                       int *qProjSize,
+                       int *kProjSize,
+                       int *vProjSize,
+                       int *oProjSize,
+                       int *qoMaxSeqLength,
+                       int *kvMaxSeqLength,
+                       int *maxBatchSize,
+                       int *maxBeamSize);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
+                             const cudnnAttnDescriptor_t attnDesc,
+                             size_t *weightSizeInBytes,
+                             size_t *workSpaceSizeInBytes,
+                             size_t *reserveSpaceSizeInBytes);
+
+typedef enum { /* selects one weight or bias tensor; passed as wKind to cudnnGetMultiHeadAttnWeights() */
+    CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
+    CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
+    CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
+    CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
+    CUDNN_MH_ATTN_Q_BIASES  = 4, /* input projection biases for 'queries' */
+    CUDNN_MH_ATTN_K_BIASES  = 5, /* input projection biases for 'keys' */
+    CUDNN_MH_ATTN_V_BIASES  = 6, /* input projection biases for 'values' */
+    CUDNN_MH_ATTN_O_BIASES  = 7, /* output projection biases */
+} cudnnMultiHeadAttnWeightKind_t;
+
+#define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors, i.e. the count of cudnnMultiHeadAttnWeightKind_t values (0..7) */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
+                             const cudnnAttnDescriptor_t attnDesc,
+                             cudnnMultiHeadAttnWeightKind_t wKind,
+                             size_t weightSizeInBytes,
+                             const void *weights,
+                             cudnnTensorDescriptor_t wDesc,
+                             void **wAddr);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnMultiHeadAttnForward(cudnnHandle_t handle,
+                          const cudnnAttnDescriptor_t attnDesc,
+                          int currIdx,
+                          const int loWinIdx[],
+                          const int hiWinIdx[],
+                          const int devSeqLengthsQO[],
+                          const int devSeqLengthsKV[],
+                          const cudnnSeqDataDescriptor_t qDesc,
+                          const void *queries,
+                          const void *residuals,
+                          const cudnnSeqDataDescriptor_t kDesc,
+                          const void *keys,
+                          const cudnnSeqDataDescriptor_t vDesc,
+                          const void *values,
+                          const cudnnSeqDataDescriptor_t oDesc,
+                          void *out,
+                          size_t weightSizeInBytes,
+                          const void *weights,
+                          size_t workSpaceSizeInBytes,
+                          void *workSpace,
+                          size_t reserveSpaceSizeInBytes,
+                          void *reserveSpace);
+
+/*
+ * \brief Cross-library version checker.
+ * This function is implemented differently in each sub-library. Each sublib
+ * checks whether its own version matches that of its dependencies.
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
+ *          CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnAdvVersionCheck(void);
+
+typedef enum { /* passed as addGrad to cudnnRNNBackwardWeights_v8() and cudnnMultiHeadAttnBackwardWeights() */
+    CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */
+    CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */
+} cudnnWgradMode_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardData_v8(cudnnHandle_t handle,
+                        cudnnRNNDescriptor_t rnnDesc,
+                        const int32_t devSeqLengths[],
+                        cudnnRNNDataDescriptor_t yDesc,
+                        const void *y,
+                        const void *dy,
+                        cudnnRNNDataDescriptor_t xDesc,
+                        void *dx,
+                        cudnnTensorDescriptor_t hDesc,
+                        const void *hx,
+                        const void *dhy,
+                        void *dhx,
+                        cudnnTensorDescriptor_t cDesc,
+                        const void *cx,
+                        const void *dcy,
+                        void *dcx,
+                        size_t weightSpaceSize,
+                        const void *weightSpace,
+                        size_t workSpaceSize,
+                        void *workSpace,
+                        size_t reserveSpaceSize,
+                        void *reserveSpace);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
+                           cudnnRNNDescriptor_t rnnDesc,
+                           cudnnWgradMode_t addGrad,
+                           const int32_t devSeqLengths[],
+                           cudnnRNNDataDescriptor_t xDesc,
+                           const void *x,
+                           cudnnTensorDescriptor_t hDesc,
+                           const void *hx,
+                           cudnnRNNDataDescriptor_t yDesc,
+                           const void *y,
+                           size_t weightSpaceSize,
+                           void *dweightSpace,
+                           size_t workSpaceSize,
+                           void *workSpace,
+                           size_t reserveSpaceSize,
+                           void *reserveSpace);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
+                               const cudnnAttnDescriptor_t attnDesc,
+                               const int loWinIdx[],
+                               const int hiWinIdx[],
+                               const int devSeqLengthsDQDO[],
+                               const int devSeqLengthsDKDV[],
+                               const cudnnSeqDataDescriptor_t doDesc,
+                               const void *dout,
+                               const cudnnSeqDataDescriptor_t dqDesc,
+                               void *dqueries,
+                               const void *queries,
+                               const cudnnSeqDataDescriptor_t dkDesc,
+                               void *dkeys,
+                               const void *keys,
+                               const cudnnSeqDataDescriptor_t dvDesc,
+                               void *dvalues,
+                               const void *values,
+                               size_t weightSizeInBytes,
+                               const void *weights,
+                               size_t workSpaceSizeInBytes,
+                               void *workSpace,
+                               size_t reserveSpaceSizeInBytes,
+                               void *reserveSpace);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
+                                  const cudnnAttnDescriptor_t attnDesc,
+                                  cudnnWgradMode_t addGrad,
+                                  const cudnnSeqDataDescriptor_t qDesc,
+                                  const void *queries,
+                                  const cudnnSeqDataDescriptor_t kDesc,
+                                  const void *keys,
+                                  const cudnnSeqDataDescriptor_t vDesc,
+                                  const void *values,
+                                  const cudnnSeqDataDescriptor_t doDesc,
+                                  const void *dout,
+                                  size_t weightSizeInBytes,
+                                  const void *weights,
+                                  void *dweights,
+                                  size_t workSpaceSizeInBytes,
+                                  void *workSpace,
+                                  size_t reserveSpaceSizeInBytes,
+                                  void *reserveSpace);
+
+/*
+* CTC (Connectionist Temporal Classification) loss descriptor create/destroy/set/get functions
+*/
+/* Input normalization mode for loss function */
+typedef enum { /* passed as normMode to cudnnSetCTCLossDescriptorEx(), cudnnSetCTCLossDescriptor_v8() and cudnnSetCTCLossDescriptor_v9() */
+    CUDNN_LOSS_NORMALIZATION_NONE    = 0,
+    CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
+} cudnnLossNormalizationMode_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
+                            cudnnDataType_t compType,
+                            cudnnLossNormalizationMode_t normMode,
+                            cudnnNanPropagation_t gradMode);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
+                             cudnnDataType_t compType,
+                             cudnnLossNormalizationMode_t normMode,
+                             cudnnNanPropagation_t gradMode,
+                             int maxLabelLength);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
+                             cudnnDataType_t compType,
+                             cudnnLossNormalizationMode_t normMode,
+                             cudnnCTCGradMode_t ctcGradMode,
+                             int maxLabelLength);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
+                            cudnnDataType_t *compType,
+                            cudnnLossNormalizationMode_t *normMode,
+                            cudnnNanPropagation_t *gradMode);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
+                             cudnnDataType_t *compType,
+                             cudnnLossNormalizationMode_t *normMode,
+                             cudnnNanPropagation_t *gradMode,
+                             int *maxLabelLength);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
+                             cudnnDataType_t *compType,
+                             cudnnLossNormalizationMode_t *normMode,
+                             cudnnCTCGradMode_t *ctcGradMode,
+                             int *maxLabelLength);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
+
+/* return the ctc costs and gradients, given the probabilities and labels */
+cudnnStatus_t CUDNNWINAPI
+cudnnCTCLoss(
+    cudnnHandle_t handle,
+    const cudnnTensorDescriptor_t
+        probsDesc,     /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
+                          mini batch size, A is the alphabet size)  */
+    const void *probs, /* probabilities after softmax, in GPU memory */
+    const int hostLabels[],                      /* labels, in CPU memory */
+    const int hostLabelLengths[],                /* the length of each label, in CPU memory */
+    const int hostInputLengths[],                /* the lengths of timing steps in each batch, in CPU memory */
+    void *costs,                                 /* the returned costs of CTC, in GPU memory */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
+    void *gradients,         /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
+    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    void *workspace,              /* pointer to the workspace, in GPU memory */
+    size_t workSpaceSizeInBytes); /* size of the workspace */
+
+/* return the ctc costs and gradients, given the probabilities and labels (v8: labels and lengths are in GPU memory) */
+cudnnStatus_t CUDNNWINAPI
+cudnnCTCLoss_v8(
+    cudnnHandle_t handle,
+    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    const cudnnTensorDescriptor_t
+        probsDesc,     /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
+                          mini batch size, A is the alphabet size)  */
+    const void *probs, /* probabilities after softmax, in GPU memory */
+    const int labels[],                          /* labels, in GPU memory */
+    const int labelLengths[],                    /* the length of each label, in GPU memory */
+    const int inputLengths[],                    /* the lengths of timing steps in each batch, in GPU memory */
+    void *costs,                                 /* the returned costs of CTC, in GPU memory */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
+    void *gradients,             /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
+    size_t workSpaceSizeInBytes, /* size of the workspace */
+    void *workspace);            /* pointer to the workspace, in GPU memory */
+
+/* return the workspace size needed for ctc */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossWorkspaceSize(
+    cudnnHandle_t handle,
+    const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
+                                                timing steps, N is the mini batch size, A is the alphabet size) */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
+                                                    dimensions are T,N,A. To compute costs
+                                                    only, set it to NULL */
+    const int *labels,                           /* labels, in CPU memory */
+    const int *labelLengths,                     /* the length of each label, in CPU memory */
+    const int *inputLengths,                     /* the lengths of timing steps in each batch, in CPU memory */
+    cudnnCTCLossAlgo_t algo,                     /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    size_t *sizeInBytes); /* pointer to the returned workspace size */
+
+/* return the workspace size needed for ctc (v8: no label/length arrays are passed) */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetCTCLossWorkspaceSize_v8(
+    cudnnHandle_t handle,
+    cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
+    cudnnCTCLossDescriptor_t ctcLossDesc,
+    const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
+                                                timing steps, N is the mini batch size, A is the alphabet size) */
+    const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
+                                                    dimensions are T,N,A. To compute costs
+                                                    only, set it to NULL */
+    size_t *sizeInBytes);                        /* pointer to the returned workspace size */
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_ADV_H_ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h
new file mode 100644
index 0000000000000000000000000000000000000000..5a378e2087f7a45c423f65d213d98c4fa20f3a52
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright 2014-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#ifndef _CUDNN_BACKEND_H_
+#define _CUDNN_BACKEND_H_
+
+/*
+ * The content of this header has been moved into cudnn_graph.h.
+ * This header is kept for backward compatibility.
+ */
+
+#include "cudnn_graph.h"
+
+#endif /* _CUDNN_BACKEND_H_ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_graph.h b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..c5394671423f9e950b47a61d59f9842f59a247d1
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_graph.h
@@ -0,0 +1,909 @@
+/*
+ * Copyright 2014-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ *  cudnn_graph : cuDNN's basic definitions and operations.
+ */
+
+#if !defined(CUDNN_GRAPH_H_)
+#define CUDNN_GRAPH_H_
+
+#include <cuda_runtime_api.h>
+#include <library_types.h>
+
+#include <stdint.h>
+
+#include "cudnn_version.h"
+
+/* These version numbers are autogenerated, do not edit manually. */
+#define CUDNN_GRAPH_MAJOR 9
+#define CUDNN_GRAPH_MINOR 1
+#define CUDNN_GRAPH_PATCH 0
+
+#if (CUDNN_GRAPH_MAJOR != CUDNN_MAJOR) || (CUDNN_GRAPH_MINOR != CUDNN_MINOR) || (CUDNN_GRAPH_PATCH != CUDNN_PATCHLEVEL)
+#error Version mismatch in cuDNN GRAPH!!!
+#endif
+
+#ifndef CUDNNWINAPI
+#ifdef _WIN32
+#define CUDNNWINAPI __stdcall
+#else
+#define CUDNNWINAPI
+#endif
+#endif
+
+/* Warnings for deprecated APIs are enabled using the CUDNN_WARN_DEPRECATED macro */
+#if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
+/* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
+#define CUDNN_DEPRECATED __attribute__((deprecated))
+#define CUDNN_DEPRECATED_ENUM __attribute__((deprecated))
+#elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
+/* Microsoft Visual C++ */
+#define CUDNN_DEPRECATED __declspec(deprecated)
+#define CUDNN_DEPRECATED_ENUM __declspec(deprecated)
+#elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
+/* C++14 compilers */
+#define CUDNN_DEPRECATED [[deprecated]]
+#define CUDNN_DEPRECATED_ENUM [[deprecated]]
+#else
+/* No support for the deprecated attribute */
+#define CUDNN_DEPRECATED
+#define CUDNN_DEPRECATED_ENUM
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+struct cudnnContext; /* declared without a definition at this point (incomplete type) */
+typedef struct cudnnContext *cudnnHandle_t; /* library handle: passed by pointer to cudnnCreate() and by value to cudnnDestroy() */
+
+size_t CUDNNWINAPI
+cudnnGetVersion(void);
+
+size_t CUDNNWINAPI
+cudnnGetMaxDeviceVersion(void);
+
+/* Returns CUDA Runtime version statically linked against cudnn */
+size_t CUDNNWINAPI
+cudnnGetCudartVersion(void);
+
+/*
+ * CUDNN return codes
+ */
+typedef enum { /* codes are grouped in blocks of 1000; see CUDNN_STATUS_CATEGORY() and CUDNN_STATUS_SPECIFIC_ERROR() */
+    CUDNN_STATUS_SUCCESS = 0,
+
+    /* Uncategorized errors (1xxx) */
+    CUDNN_STATUS_NOT_INITIALIZED                = 1001,
+    CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH    = 1002,
+    CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH = 1003,
+    CUDNN_STATUS_DEPRECATED                     = 1004,
+    CUDNN_STATUS_LICENSE_ERROR                  = 1005,
+    CUDNN_STATUS_RUNTIME_IN_PROGRESS            = 1006,
+    CUDNN_STATUS_RUNTIME_FP_OVERFLOW            = 1007,
+
+    CUDNN_STATUS_BAD_PARAM                    = 2000, /* 2xxx: bad-parameter errors */
+    CUDNN_STATUS_BAD_PARAM_NULL_POINTER       = 2002,
+    CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER = 2003,
+    CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED      = 2004,
+    CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND       = 2005,
+    CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT  = 2006,
+    CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH    = 2007,
+    CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH     = 2008,
+    CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009,
+    CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE     = 2010,
+
+    CUDNN_STATUS_NOT_SUPPORTED                              = 3000, /* 3xxx: not-supported errors */
+    CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN                = 3001,
+    CUDNN_STATUS_NOT_SUPPORTED_SHAPE                        = 3002,
+    CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE                    = 3003,
+    CUDNN_STATUS_NOT_SUPPORTED_LAYOUT                       = 3004,
+    CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER     = 3005,
+    CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART          = 3006,
+    CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH                = 3007,
+    CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING = 3008,
+    CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE       = 3009,
+    CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT   = 3010,
+    CUDNN_STATUS_NOT_SUPPORTED_PADDING                      = 3011,
+    CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM             = 3012,
+
+    CUDNN_STATUS_INTERNAL_ERROR                          = 4000, /* 4xxx: internal errors */
+    CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED       = 4001,
+    CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE         = 4002,
+    CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED   = 4003,
+    CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED = 4004,
+    CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM         = 4005,
+    CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED  = 4006,
+
+    CUDNN_STATUS_EXECUTION_FAILED             = 5000, /* 5xxx: execution failures */
+    CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER = 5001,
+    CUDNN_STATUS_EXECUTION_FAILED_CUBLAS      = 5002,
+    CUDNN_STATUS_EXECUTION_FAILED_CUDART      = 5003,
+    CUDNN_STATUS_EXECUTION_FAILED_CURAND      = 5004,
+
+    CUDNN_STATUS_ALLOC_FAILED CUDNN_DEPRECATED_ENUM  = CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED, /* deprecated names from here to the end of the enum */
+    CUDNN_STATUS_INVALID_VALUE CUDNN_DEPRECATED_ENUM = 2001 /* please transition to CUDNN_STATUS_BAD_PARAM instead */,
+    CUDNN_STATUS_ARCH_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH,
+    CUDNN_STATUS_MAPPING_ERROR CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED,
+    CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING CUDNN_DEPRECATED_ENUM =
+        CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING,
+    CUDNN_STATUS_VERSION_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH,
+} cudnnStatus_t;
+
+#define CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err) ((cudnnStatus_t)(0 + (category) + (specific_err))) /* sum of category and specific error, cast to cudnnStatus_t */
+#define CUDNN_STATUS_CATEGORY(full_error_code) ((full_error_code) / 1000 * 1000) /* for integer codes: rounds down to the multiple of 1000, e.g. 2005 -> 2000 */
+#define CUDNN_STATUS_SPECIFIC_ERROR(full_error_code) ((full_error_code) % 1000) /* remainder modulo 1000, e.g. 2005 -> 5 */
+
+/* human-readable error messages */
+const char *CUDNNWINAPI
+cudnnGetErrorString(cudnnStatus_t status);
+
+void CUDNNWINAPI
+cudnnGetLastErrorString(char *message, size_t max_size);
+
+/* Forward definition in this version only */
+typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t CUDNN_DEPRECATED;
+
+typedef enum { /* passed as mode to cudnnQueryRuntimeError() */
+    CUDNN_ERRQUERY_RAWCODE     = 0,
+    CUDNN_ERRQUERY_NONBLOCKING = 1,
+    CUDNN_ERRQUERY_BLOCKING    = 2,
+} cudnnErrQueryMode_t;
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetProperty(libraryPropertyType type, int *value);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreate(cudnnHandle_t *handle);
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroy(cudnnHandle_t handle);
+cudnnStatus_t CUDNNWINAPI
+cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
+cudnnStatus_t CUDNNWINAPI
+cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
+/*
+ * CUDNN data type
+ */
+typedef enum {
+    CUDNN_DATA_FLOAT                         = 0,
+    CUDNN_DATA_DOUBLE                        = 1,
+    CUDNN_DATA_HALF                          = 2,
+    CUDNN_DATA_INT8                          = 3,
+    CUDNN_DATA_INT32                         = 4,
+    CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM  = 5,
+    CUDNN_DATA_UINT8                         = 6,
+    CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7,
+    CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8,
+    CUDNN_DATA_BFLOAT16                      = 9,
+    CUDNN_DATA_INT64                         = 10,
+    CUDNN_DATA_BOOLEAN                       = 11,
+    CUDNN_DATA_FP8_E4M3                      = 12,
+    CUDNN_DATA_FP8_E5M2                      = 13,
+    CUDNN_DATA_FAST_FLOAT_FOR_FP8            = 14,
+} cudnnDataType_t;
+
+/*
+ * CUDNN math type
+ */
+typedef enum {
+    CUDNN_DEFAULT_MATH                    = 0,
+    CUDNN_TENSOR_OP_MATH                  = 1,
+    CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
+    CUDNN_FMA_MATH                        = 3,
+} cudnnMathType_t;
+
+/*
+ * CUDNN propagate Nan
+ */
+typedef enum {
+    CUDNN_NOT_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 0,
+    CUDNN_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM     = 1,
+} cudnnNanPropagation_t;
+
+/*
+ * Behavior for OOB samples. OOB samples are samples where L+R > T is encountered during the gradient calculation. If
+ * gradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for
+ * that sample. Instead, the current values, even if not finite, are retained. If gradMode is set to
+ * CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient.
+*/
+typedef enum { /* type of the ctcGradMode parameter of cudnnSetCTCLossDescriptor_v9() / cudnnGetCTCLossDescriptor_v9() */
+    CUDNN_CTC_ZERO_OOB_GRADIENTS = 0,
+    CUDNN_CTC_SKIP_OOB_GRADIENTS = 1,
+} cudnnCTCGradMode_t;
+
+typedef enum {
+    CUDNN_TENSOR_NCHW        = 0, /* row major (wStride = 1, hStride = w) */
+    CUDNN_TENSOR_NHWC        = 1, /* feature maps interleaved (cStride = 1) */
+    CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is a vector of elements of C; the vector length is in the data type */
+} cudnnTensorFormat_t;
+
+/*
+ * CUDNN ReduceTensor op type
+ */
+typedef enum {
+    CUDNN_REDUCE_TENSOR_ADD          = 0,
+    CUDNN_REDUCE_TENSOR_MUL          = 1,
+    CUDNN_REDUCE_TENSOR_MIN          = 2,
+    CUDNN_REDUCE_TENSOR_MAX          = 3,
+    CUDNN_REDUCE_TENSOR_AMAX         = 4,
+    CUDNN_REDUCE_TENSOR_AVG          = 5,
+    CUDNN_REDUCE_TENSOR_NORM1        = 6,
+    CUDNN_REDUCE_TENSOR_NORM2        = 7,
+    CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
+} cudnnReduceTensorOp_t;
+
+/*
+ * activation mode
+ */
+typedef enum {
+    CUDNN_ACTIVATION_SIGMOID      = 0,
+    CUDNN_ACTIVATION_RELU         = 1,
+    CUDNN_ACTIVATION_TANH         = 2,
+    CUDNN_ACTIVATION_CLIPPED_RELU = 3,
+    CUDNN_ACTIVATION_ELU          = 4,
+    CUDNN_ACTIVATION_IDENTITY     = 5,
+    CUDNN_ACTIVATION_SWISH        = 6
+} cudnnActivationMode_t CUDNN_DEPRECATED;
+
+typedef enum {
+    CUDNN_SEV_FATAL   = 0,
+    CUDNN_SEV_ERROR   = 1,
+    CUDNN_SEV_WARNING = 2,
+    CUDNN_SEV_INFO    = 3,
+} cudnnSeverity_t;
+
+/* Message masks to be used with cudnnSetCallback(): one bit per cudnnSeverity_t value (bit index = enum value) */
+#define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
+#define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
+#define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
+
+/* struct containing useful information for each API call */
+typedef struct cudnnDebugStruct {
+    unsigned cudnn_version; /* NOTE(review): version encoding is not shown here -- presumably as returned by cudnnGetVersion(); confirm */
+    cudnnStatus_t cudnnStatus; /* NOTE(review): presumably the status of the API call being reported; confirm */
+    unsigned time_sec;      /* epoch time in seconds */
+    unsigned time_usec;     /* microseconds part of epoch time */
+    unsigned time_delta;    /* time since start in seconds */
+    cudnnHandle_t handle;   /* cudnn handle */
+    cudaStream_t stream;    /* cuda stream ID */
+    unsigned long long pid; /* process ID */
+    unsigned long long tid; /* thread ID */
+    int cudaDeviceId;       /* CUDA device ID */
+    int reserved[15];       /* reserved for future use */
+} cudnnDebug_t; /* handed to cudnnCallback_t callbacks as a const pointer */
+
+typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg); /* callback type taken by cudnnSetCallback and returned through cudnnGetCallback */
+
+cudnnStatus_t CUDNNWINAPI /* mask is built from the CUDNN_SEV_*_EN message masks */
+cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
+
+cudnnStatus_t CUDNNWINAPI /* pointer counterparts of cudnnSetCallback's three parameters (presumably outputs -- confirm in the cuDNN API reference) */
+cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
+
+/*
+ * \brief Cross-library version checker.
+ * This function is implemented differently in each sub-library. Each sub-library
+ * checks whether its own version matches that of its dependencies.
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
+ *          CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnGraphVersionCheck(void);
+
+/* Maximum supported number of tensor dimensions (preprocessor constant) */
+#define CUDNN_DIM_MAX 8
+
+/*
+ * Convolution mode: selects between convolution (0) and cross-correlation (1).
+ */
+typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t;
+
+/*
+ * CUDNN Reorder type. The typedef itself is tagged CUDNN_DEPRECATED.
+ */
+typedef enum {
+    CUDNN_DEFAULT_REORDER = 0,
+    CUDNN_NO_REORDER      = 1,
+} cudnnReorderType_t CUDNN_DEPRECATED;
+
+typedef void *cudnnBackendDescriptor_t; /* untyped handle; see cudnnBackendCreateDescriptor / cudnnBackendDestroyDescriptor */
+
+typedef struct cudnnFractionStruct { /* fraction held as a pair of signed 64-bit integers */
+    int64_t numerator;
+    int64_t denominator;
+} cudnnFraction_t;
+
+typedef enum { /* pointwise operation selector; values are explicit and NOT listed in numeric order */
+    CUDNN_POINTWISE_ADD        = 0,
+    CUDNN_POINTWISE_ADD_SQUARE = 5,
+    CUDNN_POINTWISE_DIV        = 6,
+    CUDNN_POINTWISE_MAX        = 3,
+    CUDNN_POINTWISE_MIN        = 2,
+    CUDNN_POINTWISE_MOD        = 7,
+    CUDNN_POINTWISE_MUL        = 1,
+    CUDNN_POINTWISE_POW        = 8,
+    CUDNN_POINTWISE_SUB        = 9,
+
+    CUDNN_POINTWISE_ABS        = 10,
+    CUDNN_POINTWISE_CEIL       = 11,
+    CUDNN_POINTWISE_COS        = 12,
+    CUDNN_POINTWISE_EXP        = 13,
+    CUDNN_POINTWISE_FLOOR      = 14,
+    CUDNN_POINTWISE_LOG        = 15,
+    CUDNN_POINTWISE_NEG        = 16,
+    CUDNN_POINTWISE_RSQRT      = 17,
+    CUDNN_POINTWISE_SIN        = 18,
+    CUDNN_POINTWISE_SQRT       = 4, /* numbered out of sequence: fills the 4 slot left open above */
+    CUDNN_POINTWISE_TAN        = 19,
+    CUDNN_POINTWISE_ERF        = 20,
+    CUDNN_POINTWISE_IDENTITY   = 21,
+    CUDNN_POINTWISE_RECIPROCAL = 22,
+    CUDNN_POINTWISE_ATAN2      = 23,
+
+    CUDNN_POINTWISE_RELU_FWD             = 100, /* 100-series: *_FWD modes */
+    CUDNN_POINTWISE_TANH_FWD             = 101,
+    CUDNN_POINTWISE_SIGMOID_FWD          = 102,
+    CUDNN_POINTWISE_ELU_FWD              = 103,
+    CUDNN_POINTWISE_GELU_FWD             = 104,
+    CUDNN_POINTWISE_SOFTPLUS_FWD         = 105,
+    CUDNN_POINTWISE_SWISH_FWD            = 106,
+    CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,
+
+    CUDNN_POINTWISE_RELU_BWD             = 200, /* 200-series: *_BWD modes, same ordering as the 100-series */
+    CUDNN_POINTWISE_TANH_BWD             = 201,
+    CUDNN_POINTWISE_SIGMOID_BWD          = 202,
+    CUDNN_POINTWISE_ELU_BWD              = 203,
+    CUDNN_POINTWISE_GELU_BWD             = 204,
+    CUDNN_POINTWISE_SOFTPLUS_BWD         = 205,
+    CUDNN_POINTWISE_SWISH_BWD            = 206,
+    CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,
+
+    CUDNN_POINTWISE_CMP_EQ  = 300, /* 300-series: CMP_* modes */
+    CUDNN_POINTWISE_CMP_NEQ = 301,
+    CUDNN_POINTWISE_CMP_GT  = 302,
+    CUDNN_POINTWISE_CMP_GE  = 303,
+    CUDNN_POINTWISE_CMP_LT  = 304,
+    CUDNN_POINTWISE_CMP_LE  = 305,
+
+    CUDNN_POINTWISE_LOGICAL_AND = 400, /* 400-series: LOGICAL_* modes */
+    CUDNN_POINTWISE_LOGICAL_OR  = 401,
+    CUDNN_POINTWISE_LOGICAL_NOT = 402,
+
+    CUDNN_POINTWISE_GEN_INDEX = 501, /* note: starts at 501, not 500 */
+
+    CUDNN_POINTWISE_BINARY_SELECT = 601, /* note: starts at 601, not 600 */
+} cudnnPointwiseMode_t;
+
+typedef enum { /* resample mode; two enumerators share the value 2 and values are not listed in numeric order */
+    CUDNN_RESAMPLE_NEAREST                 = 0,
+    CUDNN_RESAMPLE_BILINEAR                = 1,
+    CUDNN_RESAMPLE_AVGPOOL                 = 2,
+    CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2, /* same value as CUDNN_RESAMPLE_AVGPOOL: the two cannot be distinct switch cases */
+    CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4,
+    CUDNN_RESAMPLE_MAXPOOL                 = 3,
+} cudnnResampleMode_t;
+
+typedef enum { /* signal operation mode; cf. CUDNN_ATTR_OPERATION_SIGNAL_MODE and CUDNN_TYPE_SIGNAL_MODE */
+    CUDNN_SIGNAL_SET  = 0,
+    CUDNN_SIGNAL_WAIT = 1,
+} cudnnSignalMode_t;
+
+typedef enum { /* single-valued enum: only one gen-stats mode is defined here */
+    CUDNN_GENSTATS_SUM_SQSUM = 0,
+} cudnnGenStatsMode_t;
+
+typedef enum { /* BN-finalize statistics mode: training (0) vs. inference (1) */
+    CUDNN_BN_FINALIZE_STATISTICS_TRAINING  = 0,
+    CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
+} cudnnBnFinalizeStatsMode_t;
+
+typedef enum { /* no explicit initializers: values are 0, 1, 2 in declaration order, so the order is part of the ABI */
+    CUDNN_RNG_DISTRIBUTION_BERNOULLI,
+    CUDNN_RNG_DISTRIBUTION_UNIFORM,
+    CUDNN_RNG_DISTRIBUTION_NORMAL,
+} cudnnRngDistribution_t;
+
+typedef enum { /* attribute keys, grouped by name prefix; values are explicit and the numbering has gaps */
+    CUDNN_ATTR_POINTWISE_MODE                                  = 0,
+    CUDNN_ATTR_POINTWISE_MATH_PREC                             = 1,
+    CUDNN_ATTR_POINTWISE_NAN_PROPAGATION CUDNN_DEPRECATED_ENUM = 2,
+    CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP                       = 3,
+    CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP                       = 4,
+    CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE                 = 5,
+    CUDNN_ATTR_POINTWISE_ELU_ALPHA                             = 6,
+    CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA                         = 7,
+    CUDNN_ATTR_POINTWISE_SWISH_BETA                            = 8,
+    CUDNN_ATTR_POINTWISE_AXIS                                  = 9,
+
+    CUDNN_ATTR_CONVOLUTION_COMP_TYPE      = 100,
+    CUDNN_ATTR_CONVOLUTION_CONV_MODE      = 101,
+    CUDNN_ATTR_CONVOLUTION_DILATIONS      = 102,
+    CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
+    CUDNN_ATTR_CONVOLUTION_POST_PADDINGS  = 104,
+    CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS   = 105,
+    CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS   = 106,
+
+    CUDNN_ATTR_ENGINEHEUR_MODE            = 200,
+    CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
+    CUDNN_ATTR_ENGINEHEUR_RESULTS         = 202,
+    CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203,
+
+    CUDNN_ATTR_ENGINECFG_ENGINE            = 300,
+    CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
+    CUDNN_ATTR_ENGINECFG_KNOB_CHOICES      = 302,
+
+    CUDNN_ATTR_EXECUTION_PLAN_HANDLE                     = 400,
+    CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG              = 401,
+    CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE             = 402,
+    CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
+    CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
+    CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION        = 405,
+
+    CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID            = 500,
+    CUDNN_ATTR_INTERMEDIATE_INFO_SIZE                 = 501,
+    CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS  = 502,
+    CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
+
+    CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE  = 600,
+    CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
+
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA        = 700,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA         = 701,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC    = 702,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W            = 703,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X            = 704,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y            = 705,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA       = 706,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA        = 707,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC   = 708,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W           = 709,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX          = 710,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY          = 711,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA     = 712,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA      = 713,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW        = 715,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X         = 716,
+    CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY        = 717,
+
+    CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
+    CUDNN_ATTR_OPERATION_POINTWISE_XDESC         = 751,
+    CUDNN_ATTR_OPERATION_POINTWISE_BDESC         = 752,
+    CUDNN_ATTR_OPERATION_POINTWISE_YDESC         = 753,
+    CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1        = 754,
+    CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2        = 755,
+    CUDNN_ATTR_OPERATION_POINTWISE_DXDESC        = 756,
+    CUDNN_ATTR_OPERATION_POINTWISE_DYDESC        = 757,
+    CUDNN_ATTR_OPERATION_POINTWISE_TDESC         = 758,
+
+    CUDNN_ATTR_OPERATION_GENSTATS_MODE      = 770,
+    CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
+    CUDNN_ATTR_OPERATION_GENSTATS_XDESC     = 772,
+    CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC   = 773,
+    CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
+
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE                = 780,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC                 = 781,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC                = 782,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC             = 783,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC                = 784,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC                 = 785,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC    = 786,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC     = 787,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC  = 789,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC           = 790,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC        = 791,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC             = 792,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC              = 793,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC          = 794,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC              = 795,
+    CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC   = 796, /* "AVERATE" (sic): the spelling is part of the public identifier */
+
+    CUDNN_ATTR_OPERATIONGRAPH_HANDLE              = 800,
+    CUDNN_ATTR_OPERATIONGRAPH_OPS                 = 801,
+    CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
+
+    CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT       = 900,
+    CUDNN_ATTR_TENSOR_DATA_TYPE            = 901,
+    CUDNN_ATTR_TENSOR_DIMENSIONS           = 902,
+    CUDNN_ATTR_TENSOR_STRIDES              = 903,
+    CUDNN_ATTR_TENSOR_VECTOR_COUNT         = 904,
+    CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
+    CUDNN_ATTR_TENSOR_UNIQUE_ID            = 906,
+    CUDNN_ATTR_TENSOR_IS_VIRTUAL           = 907,
+    CUDNN_ATTR_TENSOR_IS_BY_VALUE          = 908,
+    CUDNN_ATTR_TENSOR_REORDERING_MODE      = 909,
+    CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC   = 913, /* 910-912 are not defined in this enum */
+
+    CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS    = 1000,
+    CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
+    CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
+    CUDNN_ATTR_VARIANT_PACK_WORKSPACE     = 1003,
+
+    CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
+    CUDNN_ATTR_LAYOUT_INFO_TYPES      = 1101,
+
+    CUDNN_ATTR_KNOB_INFO_TYPE          = 1200,
+    CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
+    CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
+    CUDNN_ATTR_KNOB_INFO_STRIDE        = 1203,
+
+    CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
+    CUDNN_ATTR_ENGINE_GLOBAL_INDEX    = 1301,
+    CUDNN_ATTR_ENGINE_KNOB_INFO       = 1302,
+    CUDNN_ATTR_ENGINE_NUMERICAL_NOTE  = 1303,
+    CUDNN_ATTR_ENGINE_LAYOUT_INFO     = 1304,
+    CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE   = 1305,
+    CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306,
+
+    CUDNN_ATTR_MATMUL_COMP_TYPE     = 1500,
+    CUDNN_ATTR_MATMUL_PADDING_VALUE = 1503, /* 1501-1502 are not defined in this enum */
+
+    CUDNN_ATTR_OPERATION_MATMUL_ADESC                                                 = 1520,
+    CUDNN_ATTR_OPERATION_MATMUL_BDESC                                                 = 1521,
+    CUDNN_ATTR_OPERATION_MATMUL_CDESC                                                 = 1522,
+    CUDNN_ATTR_OPERATION_MATMUL_DESC                                                  = 1523,
+    CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT CUDNN_DEPRECATED_ENUM = 1524,
+    CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC                                  = 1525,
+    CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC                                  = 1526,
+    CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC                                  = 1527,
+
+    CUDNN_ATTR_REDUCTION_OPERATOR  = 1600,
+    CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
+
+    CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
+    CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
+    CUDNN_ATTR_OPERATION_REDUCTION_DESC  = 1612,
+
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC        = 1620,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC        = 1621,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC      = 1622,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC    = 1623,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC           = 1624,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC          = 1625,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC   = 1626,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC    = 1627,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC  = 1629,
+    CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS          = 1630,
+
+    CUDNN_ATTR_RESAMPLE_MODE            = 1700,
+    CUDNN_ATTR_RESAMPLE_COMP_TYPE       = 1701,
+    CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS    = 1702,
+    CUDNN_ATTR_RESAMPLE_POST_PADDINGS   = 1703,
+    CUDNN_ATTR_RESAMPLE_PRE_PADDINGS    = 1704,
+    CUDNN_ATTR_RESAMPLE_STRIDES         = 1705,
+    CUDNN_ATTR_RESAMPLE_WINDOW_DIMS     = 1706,
+    CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
+    CUDNN_ATTR_RESAMPLE_PADDING_MODE    = 1708,
+
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC                       = 1710,
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC                       = 1711,
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC                     = 1712,
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA CUDNN_DEPRECATED_ENUM = 1713,
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA CUDNN_DEPRECATED_ENUM  = 1714,
+    CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC                        = 1716, /* 1715 is not defined in this enum */
+
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC                      = 1720,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC                      = 1721,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC                     = 1722,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA CUDNN_DEPRECATED_ENUM = 1723,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA CUDNN_DEPRECATED_ENUM  = 1724,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC                        = 1725,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC                       = 1726,
+    CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC                       = 1727,
+
+    CUDNN_ATTR_OPERATION_CONCAT_AXIS          = 1800,
+    CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS   = 1801,
+    CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
+    CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC   = 1803,
+
+    CUDNN_ATTR_OPERATION_SIGNAL_MODE     = 1900,
+    CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
+    CUDNN_ATTR_OPERATION_SIGNAL_VALUE    = 1902,
+    CUDNN_ATTR_OPERATION_SIGNAL_XDESC    = 1903,
+    CUDNN_ATTR_OPERATION_SIGNAL_YDESC    = 1904,
+
+    CUDNN_ATTR_OPERATION_NORM_FWD_MODE                     = 2000,
+    CUDNN_ATTR_OPERATION_NORM_FWD_PHASE                    = 2001,
+    CUDNN_ATTR_OPERATION_NORM_FWD_XDESC                    = 2002,
+    CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC                = 2003,
+    CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC        = 2004,
+    CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC               = 2005,
+    CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC                = 2006,
+    CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC             = 2007,
+    CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC      = 2008,
+    CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC  = 2009,
+    CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC   = 2010,
+    CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
+    CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC  = 2012,
+    CUDNN_ATTR_OPERATION_NORM_FWD_YDESC                    = 2013,
+    CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS          = 2014,
+
+    CUDNN_ATTR_OPERATION_NORM_BWD_MODE              = 2100,
+    CUDNN_ATTR_OPERATION_NORM_BWD_XDESC             = 2101,
+    CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC         = 2102,
+    CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
+    CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC            = 2104,
+    CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC        = 2105,
+    CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC      = 2106,
+    CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC       = 2107,
+    CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC        = 2108,
+    CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC            = 2109,
+    CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS   = 2110,
+
+    CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
+    CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,
+
+    CUDNN_ATTR_RNG_DISTRIBUTION                   = 2300,
+    CUDNN_ATTR_RNG_NORMAL_DIST_MEAN               = 2301,
+    CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
+    CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM           = 2303,
+    CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM           = 2304,
+    CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY     = 2305,
+
+    CUDNN_ATTR_OPERATION_RNG_YDESC       = 2310,
+    CUDNN_ATTR_OPERATION_RNG_SEED        = 2311,
+    CUDNN_ATTR_OPERATION_RNG_DESC        = 2312,
+    CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313,
+} cudnnBackendAttributeName_t;
+
+typedef enum { /* only the first value is explicit; the rest follow declaration order, so the order is part of the ABI */
+    CUDNN_TYPE_HANDLE = 0,
+    CUDNN_TYPE_DATA_TYPE,
+    CUDNN_TYPE_BOOLEAN,
+    CUDNN_TYPE_INT64,
+    CUDNN_TYPE_FLOAT,
+    CUDNN_TYPE_DOUBLE,
+    CUDNN_TYPE_VOID_PTR,
+    CUDNN_TYPE_CONVOLUTION_MODE,
+    CUDNN_TYPE_HEUR_MODE,
+    CUDNN_TYPE_KNOB_TYPE,
+    CUDNN_TYPE_NAN_PROPOGATION CUDNN_DEPRECATED_ENUM, /* "PROPOGATION" (sic): the spelling is part of the public identifier */
+    CUDNN_TYPE_NUMERICAL_NOTE,
+    CUDNN_TYPE_LAYOUT_TYPE,
+    CUDNN_TYPE_ATTRIB_NAME,
+    CUDNN_TYPE_POINTWISE_MODE,
+    CUDNN_TYPE_BACKEND_DESCRIPTOR,
+    CUDNN_TYPE_GENSTATS_MODE,
+    CUDNN_TYPE_BN_FINALIZE_STATS_MODE,
+    CUDNN_TYPE_REDUCTION_OPERATOR_TYPE,
+    CUDNN_TYPE_BEHAVIOR_NOTE,
+    CUDNN_TYPE_TENSOR_REORDERING_MODE,
+    CUDNN_TYPE_RESAMPLE_MODE,
+    CUDNN_TYPE_PADDING_MODE,
+    CUDNN_TYPE_INT32,
+    CUDNN_TYPE_CHAR,
+    CUDNN_TYPE_SIGNAL_MODE,
+    CUDNN_TYPE_FRACTION,
+    CUDNN_TYPE_NORM_MODE,
+    CUDNN_TYPE_NORM_FWD_PHASE,
+    CUDNN_TYPE_RNG_DISTRIBUTION
+} cudnnBackendAttributeType_t;
+
+typedef enum { /* descriptor kinds accepted by cudnnBackendCreateDescriptor; values are implicit after the first, so order is part of the ABI */
+    CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
+    CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR,
+    CUDNN_BACKEND_ENGINE_DESCRIPTOR,
+    CUDNN_BACKEND_ENGINECFG_DESCRIPTOR,
+    CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR,
+    CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR,
+    CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR,
+    CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR,
+    CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR,
+    CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR,
+    CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR,
+    CUDNN_BACKEND_TENSOR_DESCRIPTOR,
+    CUDNN_BACKEND_MATMUL_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR,
+    CUDNN_BACKEND_REDUCTION_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR,
+    CUDNN_BACKEND_RESAMPLE_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR,
+    CUDNN_BACKEND_RNG_DESCRIPTOR,
+    CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR,
+} cudnnBackendDescriptorType_t;
+
+typedef enum { /* values are implicit after the first (0, 1, 2, ...) */
+    CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
+    CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS,
+    CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION,
+    CUDNN_NUMERICAL_NOTE_FFT,
+    CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC,
+    CUDNN_NUMERICAL_NOTE_WINOGRAD,
+    CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4,
+    CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6,
+    CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13,
+    CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP,
+    CUDNN_NUMERICAL_NOTE_TYPE_COUNT, /* trailing sentinel: implicit value 10, the number of notes listed above */
+} cudnnBackendNumericalNote_t;
+
+typedef enum { /* behavior notes; cf. CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE and CUDNN_TYPE_BEHAVIOR_NOTE */
+    CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION             = 0,
+    CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
+    CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER   = 2,
+    CUDNN_BEHAVIOR_NOTE_TYPE_COUNT, /* trailing sentinel: implicit value 3 */
+} cudnnBackendBehaviorNote_t;
+
+typedef enum { /* knob identifiers; entries tagged CUDNN_DEPRECATED_ENUM keep their numeric slots */
+    CUDNN_KNOB_TYPE_SPLIT_K CUDNN_DEPRECATED_ENUM          = 0,
+    CUDNN_KNOB_TYPE_SWIZZLE                                = 1,
+    CUDNN_KNOB_TYPE_TILE_SIZE                              = 2,
+    CUDNN_KNOB_TYPE_USE_TEX CUDNN_DEPRECATED_ENUM          = 3,
+    CUDNN_KNOB_TYPE_EDGE                                   = 4,
+    CUDNN_KNOB_TYPE_KBLOCK CUDNN_DEPRECATED_ENUM           = 5,
+    CUDNN_KNOB_TYPE_LDGA CUDNN_DEPRECATED_ENUM             = 6,
+    CUDNN_KNOB_TYPE_LDGB CUDNN_DEPRECATED_ENUM             = 7,
+    CUDNN_KNOB_TYPE_CHUNK_K CUDNN_DEPRECATED_ENUM          = 8,
+    CUDNN_KNOB_TYPE_SPLIT_H CUDNN_DEPRECATED_ENUM          = 9,
+    CUDNN_KNOB_TYPE_WINO_TILE CUDNN_DEPRECATED_ENUM        = 10,
+    CUDNN_KNOB_TYPE_MULTIPLY                               = 11,
+    CUDNN_KNOB_TYPE_SPLIT_K_BUF                            = 12,
+    CUDNN_KNOB_TYPE_TILEK                                  = 13,
+    CUDNN_KNOB_TYPE_STAGES                                 = 14,
+    CUDNN_KNOB_TYPE_REDUCTION_MODE                         = 15,
+    CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE CUDNN_DEPRECATED_ENUM = 16,
+    CUDNN_KNOB_TYPE_SPLIT_K_SLC                            = 17,
+    CUDNN_KNOB_TYPE_IDX_MODE CUDNN_DEPRECATED_ENUM         = 18,
+    CUDNN_KNOB_TYPE_SLICED CUDNN_DEPRECATED_ENUM           = 19,
+    CUDNN_KNOB_TYPE_SPLIT_RS CUDNN_DEPRECATED_ENUM         = 20,
+    CUDNN_KNOB_TYPE_SINGLEBUFFER CUDNN_DEPRECATED_ENUM     = 21,
+    CUDNN_KNOB_TYPE_LDGC CUDNN_DEPRECATED_ENUM             = 22,
+    CUDNN_KNOB_TYPE_SPECFILT                               = 23,
+    CUDNN_KNOB_TYPE_KERNEL_CFG                             = 24,
+    CUDNN_KNOB_TYPE_WORKSPACE                              = 25,
+    CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM         = 26,
+    CUDNN_KNOB_TYPE_TILE_CGA_M                             = 27,
+    CUDNN_KNOB_TYPE_TILE_CGA_N                             = 28,
+    CUDNN_KNOB_TYPE_BLOCK_SIZE                             = 29,
+    CUDNN_KNOB_TYPE_OCCUPANCY                              = 30,
+    CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD                  = 31,
+    CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM  = 32,
+    CUDNN_KNOB_TYPE_SPLIT_COLS                             = 33,
+    CUDNN_KNOB_TYPE_TILE_ROWS                              = 34,
+    CUDNN_KNOB_TYPE_TILE_COLS                              = 35,
+    CUDNN_KNOB_TYPE_LOAD_SIZE                              = 36,
+    CUDNN_KNOB_TYPE_COUNTS, /* trailing sentinel: implicit value 37, one past CUDNN_KNOB_TYPE_LOAD_SIZE */
+} cudnnBackendKnobType_t;
+
+typedef enum { /* layout types; cf. CUDNN_ATTR_LAYOUT_INFO_TYPES and CUDNN_TYPE_LAYOUT_TYPE */
+    CUDNN_LAYOUT_TYPE_PREFERRED_NCHW   = 0,
+    CUDNN_LAYOUT_TYPE_PREFERRED_NHWC   = 1,
+    CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
+    CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
+    CUDNN_LAYOUT_TYPE_COUNT            = 4, /* explicit count sentinel: must be bumped by hand if a layout is added */
+} cudnnBackendLayoutType_t;
+
+typedef enum { /* heuristic modes; note MODE_B = 1 and MODE_A = 3, i.e. not in alphabetical/numeric lockstep */
+    CUDNN_HEUR_MODE_INSTANT  = 0,
+    CUDNN_HEUR_MODE_B        = 1,
+    CUDNN_HEUR_MODE_FALLBACK = 2,
+    CUDNN_HEUR_MODE_A        = 3,
+    CUDNN_HEUR_MODES_COUNT   = 4, /* explicit count sentinel */
+} cudnnBackendHeurMode_t;
+
+typedef enum { /* tensor reordering modes; cf. CUDNN_ATTR_TENSOR_REORDERING_MODE and CUDNN_TYPE_TENSOR_REORDERING_MODE */
+    CUDNN_TENSOR_REORDERING_NONE    = 0,
+    CUDNN_TENSOR_REORDERING_INT8x32 = 1,
+    CUDNN_TENSOR_REORDERING_F16x16  = 2,
+} cudnnBackendTensorReordering_t;
+
+typedef enum { /* padding modes; cf. CUDNN_ATTR_RESAMPLE_PADDING_MODE and CUDNN_TYPE_PADDING_MODE */
+    CUDNN_ZERO_PAD     = 0,
+    CUDNN_NEG_INF_PAD  = 1,
+    CUDNN_EDGE_VAL_PAD = 2,
+} cudnnPaddingMode_t;
+
+typedef enum { /* normalization modes; cf. CUDNN_ATTR_OPERATION_NORM_FWD_MODE / CUDNN_ATTR_OPERATION_NORM_BWD_MODE */
+    CUDNN_LAYER_NORM    = 0,
+    CUDNN_INSTANCE_NORM = 1,
+    CUDNN_BATCH_NORM    = 2,
+    CUDNN_GROUP_NORM    = 3,
+    CUDNN_RMS_NORM      = 4,
+} cudnnBackendNormMode_t;
+
+typedef enum { /* forward-norm phase: inference (0) vs. training (1); cf. CUDNN_ATTR_OPERATION_NORM_FWD_PHASE */
+    CUDNN_NORM_FWD_INFERENCE = 0,
+    CUDNN_NORM_FWD_TRAINING  = 1,
+} cudnnBackendNormFwdPhase_t;
+
+cudnnStatus_t CUDNNWINAPI /* descriptor is passed by pointer -- presumably receives the new handle; confirm in the cuDNN backend API reference */
+cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor);
+
+cudnnStatus_t CUDNNWINAPI /* takes the handle by value; presumably the counterpart of cudnnBackendCreateDescriptor -- confirm */
+cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor);
+
+cudnnStatus_t CUDNNWINAPI /* NOTE(review): semantics are not visible in this header -- consult the cuDNN backend API reference before use */
+cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor);
+
+cudnnStatus_t CUDNNWINAPI /* NOTE(review): presumably called once attributes have been set on the descriptor -- confirm in the cuDNN backend API reference */
+cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
+                         cudnnBackendAttributeName_t attributeName, /* a CUDNN_ATTR_* key */
+                         cudnnBackendAttributeType_t attributeType, /* a CUDNN_TYPE_* tag */
+                         int64_t elementCount,                      /* presumably the number of entries in arrayOfElements -- confirm */
+                         const void *arrayOfElements);              /* read-only, untyped input buffer */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
+                         cudnnBackendAttributeName_t attributeName, /* a CUDNN_ATTR_* key */
+                         cudnnBackendAttributeType_t attributeType, /* a CUDNN_TYPE_* tag */
+                         int64_t requestedElementCount,             /* presumably the capacity of arrayOfElements, in elements -- confirm */
+                         int64_t *elementCount,                     /* passed by pointer; presumably receives a count -- confirm */
+                         void *arrayOfElements);                    /* writable, untyped buffer */
+
+cudnnStatus_t CUDNNWINAPI /* takes a library handle plus two backend descriptors; cf. CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR and CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR */
+cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_GRAPH_H_ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops.h b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..ae4604dacdca127243253c4e3743ff71bec78f84
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops.h
@@ -0,0 +1,1316 @@
+/*
+ * Copyright 2014-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*
+ *  cudnn_ops : cuDNN's basic definitions and basic operations.
+ */
+
+#if !defined(CUDNN_OPS_H_)
+#define CUDNN_OPS_H_
+
+#include <stdint.h>
+
+#include "cudnn_version.h"
+#include "cudnn_graph.h"
+
+/* These version numbers are autogenerated, do not edit manually. */
+#define CUDNN_OPS_MAJOR 9
+#define CUDNN_OPS_MINOR 1
+#define CUDNN_OPS_PATCH 0
+
+#if (CUDNN_OPS_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_MINOR != CUDNN_MINOR) || (CUDNN_OPS_PATCH != CUDNN_PATCHLEVEL)
+#error Version mismatch in cuDNN OPS INFER!!!
+#endif
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+/* Data structures to represent Image/Filter and the Neural Network Layer */
+typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
+typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t CUDNN_DEPRECATED;
+typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t CUDNN_DEPRECATED;
+typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
+typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t CUDNN_DEPRECATED;
+typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
+typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t CUDNN_DEPRECATED;
+typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t CUDNN_DEPRECATED;
+typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
+typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t CUDNN_DEPRECATED;
+/*
+ * CUDNN Determinism
+ */
+typedef enum {
+    CUDNN_NON_DETERMINISTIC = 0,
+    CUDNN_DETERMINISTIC     = 1,
+} cudnnDeterminism_t;
+
+/* Create an instance of a generic Tensor descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
+                           cudnnTensorFormat_t format,
+                           cudnnDataType_t dataType, /* image data type */
+                           int n,                    /* number of inputs (batch size) */
+                           int c,                    /* number of input feature maps */
+                           int h,                    /* height of input section */
+                           int w);                   /* width of input section */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
+                             cudnnDataType_t dataType, /* image data type */
+                             int n,                    /* number of inputs (batch size) */
+                             int c,                    /* number of input feature maps */
+                             int h,                    /* height of input section */
+                             int w,                    /* width of input section */
+                             int nStride,
+                             int cStride,
+                             int hStride,
+                             int wStride);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
+                           cudnnDataType_t *dataType, /* image data type */
+                           int *n,                    /* number of inputs (batch size) */
+                           int *c,                    /* number of input feature maps  */
+                           int *h,                    /* height of input section */
+                           int *w,                    /* width of input section */
+                           int *nStride,
+                           int *cStride,
+                           int *hStride,
+                           int *wStride);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
+                           cudnnDataType_t dataType,
+                           int nbDims,
+                           const int dimA[],
+                           const int strideA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
+                             cudnnTensorFormat_t format,
+                             cudnnDataType_t dataType,
+                             int nbDims,
+                             const int dimA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
+                           int nbDimsRequested,
+                           cudnnDataType_t *dataType,
+                           int *nbDims,
+                           int dimA[],
+                           int strideA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
+
+/* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
+
+   1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
+   input_stride :  c x h x h_stride
+   feature_stride : h x h_stride
+   h_stride  :  >= w  ( h_stride = w if no padding)
+   w_stride  : 1
+
+
+   2)Example of all images in row major order with feature maps interleaved
+   input_stride :  c x h x h_stride
+   feature_stride : 1
+   h_stride  :  w x c
+   w_stride  : c
+
+   3)Example of all images in column major order one batch of features after the other (with optional padding on column)
+   input_stride :  c x w x w_stride
+   feature_stride : w x w_stride
+   h_stride  :  1
+   w_stride  :  >= h
+
+*/
+
+/* Destroy an instance of a generic Tensor descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
+
+/* Fold/unfold transforms */
+typedef enum {
+    CUDNN_TRANSFORM_FOLD   = 0U,
+    CUDNN_TRANSFORM_UNFOLD = 1U,
+} cudnnFoldingDirection_t;
+
+/** Create a destination descriptor for cudnnTransformTensor */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
+                       const cudnnTensorDescriptor_t srcDesc,
+                       cudnnTensorDescriptor_t destDesc,
+                       size_t *destSizeInBytes);
+
+/** Create an empty tensor transform descriptor */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
+
+/** Initialize a previously created tensor transform descriptor. */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
+                                  const uint32_t nbDims,
+                                  const cudnnTensorFormat_t destFormat,
+                                  const int32_t padBeforeA[],
+                                  const int32_t padAfterA[],
+                                  const uint32_t foldA[],
+                                  const cudnnFoldingDirection_t direction);
+
+/**
+ * Retrieves the values stored in a previously initialized tensor transform
+ * descriptor.
+ */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
+                                  uint32_t nbDimsRequested,
+                                  cudnnTensorFormat_t *destFormat,
+                                  int32_t padBeforeA[],
+                                  int32_t padAfterA[],
+                                  uint32_t foldA[],
+                                  cudnnFoldingDirection_t *direction);
+
+/**
+ * Destroys a previously created tensor transform descriptor.
+ */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
+
+/* Tensor layout conversion helper (y = alpha * x + beta * y) */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnTransformTensor(cudnnHandle_t handle,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t xDesc,
+                     const void *x,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t yDesc,
+                     void *y);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnTransformTensorEx(cudnnHandle_t handle,
+                       const cudnnTensorTransformDescriptor_t transDesc,
+                       const void *alpha,
+                       const cudnnTensorDescriptor_t srcDesc,
+                       const void *srcData,
+                       const void *beta,
+                       const cudnnTensorDescriptor_t destDesc,
+                       void *destData);
+
+/* Tensor Bias addition : C = alpha * A + beta * C  */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnAddTensor(cudnnHandle_t handle,
+               const void *alpha,
+               const cudnnTensorDescriptor_t aDesc,
+               const void *A,
+               const void *beta,
+               const cudnnTensorDescriptor_t cDesc,
+               void *C);
+
+/*
+ * CUDNN OpTensor op type
+ */
+typedef enum {
+    CUDNN_OP_TENSOR_ADD  = 0,
+    CUDNN_OP_TENSOR_MUL  = 1,
+    CUDNN_OP_TENSOR_MIN  = 2,
+    CUDNN_OP_TENSOR_MAX  = 3,
+    CUDNN_OP_TENSOR_SQRT = 4,
+    CUDNN_OP_TENSOR_NOT  = 5,
+} cudnnOpTensorOp_t;
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
+                           cudnnOpTensorOp_t opTensorOp,
+                           cudnnDataType_t opTensorCompType,
+                           cudnnNanPropagation_t opTensorNanOpt);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
+                           cudnnOpTensorOp_t *opTensorOp,
+                           cudnnDataType_t *opTensorCompType,
+                           cudnnNanPropagation_t *opTensorNanOpt);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
+
+/* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
+/* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnOpTensor(cudnnHandle_t handle,
+              const cudnnOpTensorDescriptor_t opTensorDesc,
+              const void *alpha1,
+              const cudnnTensorDescriptor_t aDesc,
+              const void *A,
+              const void *alpha2,
+              const cudnnTensorDescriptor_t bDesc,
+              const void *B,
+              const void *beta,
+              const cudnnTensorDescriptor_t cDesc,
+              void *C);
+
+/*
+ * CUDNN ReduceTensor indices type
+ */
+typedef enum {
+    CUDNN_REDUCE_TENSOR_NO_INDICES        = 0,
+    CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
+} cudnnReduceTensorIndices_t CUDNN_DEPRECATED;
+
+/*
+ * CUDNN tensor indices type size (all unsigned)
+ * Currently not supported, default is 32 bit unsigned.
+ */
+typedef enum {
+    CUDNN_32BIT_INDICES = 0,
+    CUDNN_64BIT_INDICES = 1,
+    CUDNN_16BIT_INDICES = 2,
+    CUDNN_8BIT_INDICES  = 3,
+} cudnnIndicesType_t CUDNN_DEPRECATED;
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               cudnnReduceTensorOp_t reduceTensorOp,
+                               cudnnDataType_t reduceTensorCompType,
+                               cudnnNanPropagation_t reduceTensorNanOpt,
+                               cudnnReduceTensorIndices_t reduceTensorIndices,
+                               cudnnIndicesType_t reduceTensorIndicesType);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               cudnnReduceTensorOp_t *reduceTensorOp,
+                               cudnnDataType_t *reduceTensorCompType,
+                               cudnnNanPropagation_t *reduceTensorNanOpt,
+                               cudnnReduceTensorIndices_t *reduceTensorIndices,
+                               cudnnIndicesType_t *reduceTensorIndicesType);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
+
+/* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
+ * output tensors */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetReductionIndicesSize(cudnnHandle_t handle,
+                             const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                             const cudnnTensorDescriptor_t aDesc,
+                             const cudnnTensorDescriptor_t cDesc,
+                             size_t *sizeInBytes);
+
+/* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
+ * tensors */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
+                               const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                               const cudnnTensorDescriptor_t aDesc,
+                               const cudnnTensorDescriptor_t cDesc,
+                               size_t *sizeInBytes);
+
+/* Tensor operation : C = reduce op( alpha * A ) + beta * C */
+/* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
+/* The indices space is ignored for reduce ops other than min or max. */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnReduceTensor(cudnnHandle_t handle,
+                  const cudnnReduceTensorDescriptor_t reduceTensorDesc,
+                  void *indices,
+                  size_t indicesSizeInBytes,
+                  void *workspace,
+                  size_t workspaceSizeInBytes,
+                  const void *alpha,
+                  const cudnnTensorDescriptor_t aDesc,
+                  const void *A,
+                  const void *beta,
+                  const cudnnTensorDescriptor_t cDesc,
+                  void *C);
+
+/* Set all values of a tensor to a given value : y[i] = value[0] */
+cudnnStatus_t CUDNNWINAPI
+cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
+
+/* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
+
+/* Create an instance of FilterStruct */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t dataType, /* image data type */
+                           cudnnTensorFormat_t format,
+                           int k,  /* number of output feature maps */
+                           int c,  /* number of input feature maps */
+                           int h,  /* height of each input filter */
+                           int w); /* width of  each input filter */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t *dataType, /* image data type */
+                           cudnnTensorFormat_t *format,
+                           int *k,  /* number of output feature maps */
+                           int *c,  /* number of input feature maps */
+                           int *h,  /* height of each input filter */
+                           int *w); /* width of  each input filter */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
+                           cudnnDataType_t dataType, /* image data type */
+                           cudnnTensorFormat_t format,
+                           int nbDims,
+                           const int filterDimA[]);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
+                           int nbDimsRequested,
+                           cudnnDataType_t *dataType, /* image data type */
+                           cudnnTensorFormat_t *format,
+                           int *nbDims,
+                           int filterDimA[]);
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnTransformFilter(cudnnHandle_t handle,
+                     const cudnnTensorTransformDescriptor_t transDesc,
+                     const void *alpha,
+                     const cudnnFilterDescriptor_t srcDesc,
+                     const void *srcData,
+                     const void *beta,
+                     const cudnnFilterDescriptor_t destDesc,
+                     void *destData);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
+
+/*
+ *  softmax algorithm
+ */
+typedef enum {
+    CUDNN_SOFTMAX_FAST     = 0, /* straightforward implementation */
+    CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
+    CUDNN_SOFTMAX_LOG      = 2
+} cudnnSoftmaxAlgorithm_t;
+
+typedef enum {
+    CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
+    CUDNN_SOFTMAX_MODE_CHANNEL  = 1  /* compute the softmax over all C for each H, W, N */
+} cudnnSoftmaxMode_t;
+
+/* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+
+/* Function to perform forward softmax */
+cudnnStatus_t CUDNNWINAPI
+cudnnSoftmaxForward(cudnnHandle_t handle,
+                    cudnnSoftmaxAlgorithm_t algo,
+                    cudnnSoftmaxMode_t mode,
+                    const void *alpha,
+                    const cudnnTensorDescriptor_t xDesc,
+                    const void *x,
+                    const void *beta,
+                    const cudnnTensorDescriptor_t yDesc,
+                    void *y);
+
+/*
+ *  pooling mode
+ */
+typedef enum {
+    CUDNN_POOLING_MAX                           = 0,
+    CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
+    CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
+    CUDNN_POOLING_MAX_DETERMINISTIC             = 3
+} cudnnPoolingMode_t CUDNN_DEPRECATED;
+
+/* Create an instance of pooling descriptor */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
+                            cudnnPoolingMode_t mode,
+                            cudnnNanPropagation_t maxpoolingNanOpt,
+                            int windowHeight,
+                            int windowWidth,
+                            int verticalPadding,
+                            int horizontalPadding,
+                            int verticalStride,
+                            int horizontalStride);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
+                            cudnnPoolingMode_t *mode,
+                            cudnnNanPropagation_t *maxpoolingNanOpt,
+                            int *windowHeight,
+                            int *windowWidth,
+                            int *verticalPadding,
+                            int *horizontalPadding,
+                            int *verticalStride,
+                            int *horizontalStride);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
+                            const cudnnPoolingMode_t mode,
+                            const cudnnNanPropagation_t maxpoolingNanOpt,
+                            int nbDims,
+                            const int windowDimA[],
+                            const int paddingA[],
+                            const int strideA[]);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
+                            int nbDimsRequested,
+                            cudnnPoolingMode_t *mode,
+                            cudnnNanPropagation_t *maxpoolingNanOpt,
+                            int *nbDims,
+                            int windowDimA[],
+                            int paddingA[],
+                            int strideA[]);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
+                                  const cudnnTensorDescriptor_t inputTensorDesc,
+                                  int nbDims,
+                                  int outputTensorDimA[]);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
+                                  const cudnnTensorDescriptor_t inputTensorDesc,
+                                  int *n,
+                                  int *c,
+                                  int *h,
+                                  int *w);
+
+/* Destroy an instance of pooling descriptor */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
+
+/* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+
+/* Function to perform forward pooling */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnPoolingForward(cudnnHandle_t handle,
+                    const cudnnPoolingDescriptor_t poolingDesc,
+                    const void *alpha,
+                    const cudnnTensorDescriptor_t xDesc,
+                    const void *x,
+                    const void *beta,
+                    const cudnnTensorDescriptor_t yDesc,
+                    void *y);
+
+/* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
+                             cudnnActivationMode_t mode,
+                             cudnnNanPropagation_t reluNanOpt,
+                             double coef); /* ceiling for clipped RELU, alpha for ELU */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
+                             cudnnActivationMode_t *mode,
+                             cudnnNanPropagation_t *reluNanOpt,
+                             double *coef); /* ceiling for clipped RELU, alpha for ELU */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
+
+/* Function to perform forward activation  */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnActivationForward(cudnnHandle_t handle,
+                       cudnnActivationDescriptor_t activationDesc,
+                       const void *alpha,
+                       const cudnnTensorDescriptor_t xDesc,
+                       const void *x,
+                       const void *beta,
+                       const cudnnTensorDescriptor_t yDesc,
+                       void *y);
+
+/*
+ * Create an instance of LRN (Local Response Normalization) descriptor
+ * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
+
+#define CUDNN_LRN_MIN_N 1       /* minimum allowed lrnN */
+#define CUDNN_LRN_MAX_N 16      /* maximum allowed lrnN */
+#define CUDNN_LRN_MIN_K 1e-5    /* minimum allowed lrnK */
+#define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
+
+/* LRN layer mode */
+typedef enum {
+    CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
+} cudnnLRNMode_t;
+
+/*
+ * Uses a window [center-lookBehind, center+lookAhead], where
+ * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
+ * Values of double parameters cast to tensor data type.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
+/*
+ * Retrieve the settings currently stored in an LRN layer descriptor
+ * Any of the provided pointers can be NULL (no corresponding value will be returned)
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
+
+/* Destroy an instance of LRN descriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
+
+/* LRN functions: output = alpha * normalize(x) + beta * old_y */
+
+/* LRN cross-channel forward computation. Double parameters cast to tensor data type */
+cudnnStatus_t CUDNNWINAPI
+cudnnLRNCrossChannelForward(cudnnHandle_t handle,
+                            cudnnLRNDescriptor_t normDesc,
+                            cudnnLRNMode_t lrnMode,
+                            const void *alpha,
+                            const cudnnTensorDescriptor_t xDesc,
+                            const void *x,
+                            const void *beta,
+                            const cudnnTensorDescriptor_t yDesc,
+                            void *y);
+
+typedef enum {
+    CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
+} cudnnDivNormMode_t;
+
+/* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
+cudnnStatus_t CUDNNWINAPI
+cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
+                                  cudnnLRNDescriptor_t normDesc,
+                                  cudnnDivNormMode_t mode,
+                                  const void *alpha,
+                                  const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
+                                  const void *x,
+                                  const void *means, /* if NULL, means are assumed to be zero */
+                                  void *temp,
+                                  void *temp2,
+                                  const void *beta,
+                                  const cudnnTensorDescriptor_t yDesc,
+                                  void *y);
+
+typedef enum {
+    /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
+    CUDNN_BATCHNORM_PER_ACTIVATION = 0,
+
+    /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
+    CUDNN_BATCHNORM_SPATIAL = 1,
+
+    /*
+     * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
+     * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
+     */
+    CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
+} cudnnBatchNormMode_t CUDNN_DEPRECATED;
+
+#define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
+
+/*
+ * Derives a tensor descriptor from layer data descriptor for BatchNormalization
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
+ * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
+ */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
+                              const cudnnTensorDescriptor_t xDesc,
+                              cudnnBatchNormMode_t mode);
+
+typedef enum {
+    CUDNN_BATCHNORM_OPS_BN                = 0, /* do batch normalization only */
+    CUDNN_BATCHNORM_OPS_BN_ACTIVATION     = 1, /* do batchNorm, then activation */
+    CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
+} cudnnBatchNormOps_t CUDNN_DEPRECATED;
+
+/*
+ * Performs Batch Normalization during Inference:
+ * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
+ * with bnScale, bnBias, estimatedMean, estimatedVariance tensors indexed
+ * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
+ * for notes on function arguments.
+ */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
+                                        cudnnBatchNormMode_t mode,
+                                        const void *alpha, /* alpha[0] = result blend factor */
+                                        const void *beta,  /* beta[0] = dest layer blend factor */
+                                        const cudnnTensorDescriptor_t xDesc,
+                                        const void *x, /* NxCxHxW */
+                                        const cudnnTensorDescriptor_t yDesc,
+                                        void *y, /* NxCxHxW */
+                                        const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+                                        const void *bnScale,
+                                        const void *bnBias,
+                                        const void *estimatedMean,
+                                        const void *estimatedVariance,
+                                        double epsilon);
+
+typedef enum {
+    /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
+    CUDNN_NORM_PER_ACTIVATION = 0,
+
+    /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
+    CUDNN_NORM_PER_CHANNEL = 1,
+} cudnnNormMode_t CUDNN_DEPRECATED;
+
+typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t CUDNN_DEPRECATED;
+
+/*
+ * Derives a tensor descriptor from layer data descriptor for Normalization
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
+ * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
+ */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
+                                cudnnTensorDescriptor_t derivedNormMeanVarDesc,
+                                const cudnnTensorDescriptor_t xDesc,
+                                cudnnNormMode_t mode,
+                                int groupCnt); /* Placeholder for future work; should be set to 1 for now */
+
+typedef enum {
+    CUDNN_NORM_OPS_NORM                = 0, /* do normalization only */
+    CUDNN_NORM_OPS_NORM_ACTIVATION     = 1, /* do Norm, then activation */
+    CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
+} cudnnNormOps_t CUDNN_DEPRECATED;
+
+/*
+ * Performs Normalization during Inference:
+ * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
+ * with normScale, normBias, estimatedMean, estimatedVariance tensors indexed
+ * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
+ * below for notes on function arguments.
+ */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnNormalizationForwardInference(cudnnHandle_t handle,
+                                   cudnnNormMode_t mode,
+                                   cudnnNormOps_t normOps,
+                                   cudnnNormAlgo_t algo,
+                                   const void *alpha, /* alpha[0] = result blend factor */
+                                   const void *beta,  /* beta[0] = dest layer blend factor */
+                                   const cudnnTensorDescriptor_t xDesc,
+                                   const void *x, /* NxCxHxW */
+                                   const cudnnTensorDescriptor_t normScaleBiasDesc,
+                                   const void *normScale,
+                                   const void *normBias,
+                                   const cudnnTensorDescriptor_t normMeanVarDesc,
+                                   const void *estimatedMean,
+                                   const void *estimatedVariance,
+                                   const cudnnTensorDescriptor_t zDesc,
+                                   const void *z,
+                                   cudnnActivationDescriptor_t activationDesc,
+                                   const cudnnTensorDescriptor_t yDesc,
+                                   void *y, /* NxCxHxW */
+                                   double epsilon,
+                                   int groupCnt); /* Placeholder for future work */
+
+/* APIs for spatial transformer network */
+typedef enum {
+    CUDNN_SAMPLER_BILINEAR = 0,
+} cudnnSamplerType_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
+                                       cudnnSamplerType_t samplerType,
+                                       cudnnDataType_t dataType,
+                                       const int nbDims,
+                                       const int dimA[]);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
+                                   const cudnnSpatialTransformerDescriptor_t stDesc,
+                                   const void *theta,
+                                   void *grid);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
+                             cudnnSpatialTransformerDescriptor_t stDesc,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t xDesc,
+                             const void *x,
+                             const void *grid,
+                             const void *beta,
+                             cudnnTensorDescriptor_t yDesc,
+                             void *y);
+
+typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
+
+cudnnStatus_t CUDNNWINAPI
+cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
+
+/* Helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
+
+/* Helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                          cudnnHandle_t handle,
+                          float dropout,
+                          void *states,
+                          size_t stateSizeInBytes,
+                          unsigned long long seed);
+
+/* Restores the dropout descriptor to a previously saved-off state */
+cudnnStatus_t CUDNNWINAPI
+cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                              cudnnHandle_t handle,
+                              float dropout,
+                              void *states,
+                              size_t stateSizeInBytes,
+                              unsigned long long seed);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
+                          cudnnHandle_t handle,
+                          float *dropout,
+                          void **states,
+                          unsigned long long *seed);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutForward(cudnnHandle_t handle,
+                    const cudnnDropoutDescriptor_t dropoutDesc,
+                    const cudnnTensorDescriptor_t xdesc,
+                    const void *x,
+                    const cudnnTensorDescriptor_t ydesc,
+                    void *y,
+                    void *reserveSpace,
+                    size_t reserveSpaceSizeInBytes);
+
+/* TODO: move these enums out to the appropriate submodule */
+typedef enum {
+    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM         = 0,
+    CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
+    CUDNN_CONVOLUTION_FWD_ALGO_GEMM                  = 2,
+    CUDNN_CONVOLUTION_FWD_ALGO_DIRECT                = 3,
+    CUDNN_CONVOLUTION_FWD_ALGO_FFT                   = 4,
+    CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING            = 5,
+    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD              = 6,
+    CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED     = 7,
+    CUDNN_CONVOLUTION_FWD_ALGO_COUNT                 = 8
+} cudnnConvolutionFwdAlgo_t;
+
+typedef enum {
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0                 = 0, /* non-deterministic */
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1                 = 1,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT               = 2,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3                 = 3, /* non-deterministic */
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD          = 4, /* not implemented */
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING        = 6,
+    CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT             = 7
+} cudnnConvolutionBwdFilterAlgo_t;
+
+typedef enum {
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_0                 = 0, /* non-deterministic */
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_1                 = 1,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT               = 2,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING        = 3,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD          = 4,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
+    CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT             = 6
+} cudnnConvolutionBwdDataAlgo_t;
+
+typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
+
+/*
+ * \brief Cross-library version checker.
+ * This function is implemented differently in each sub-library. Each sublib
+ * checks whether its own version matches that of its dependencies.
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
+ *          CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
+ */
+cudnnStatus_t CUDNNWINAPI
+cudnnOpsVersionCheck(void);
+
+/* Function to perform backward softmax */
+cudnnStatus_t CUDNNWINAPI
+cudnnSoftmaxBackward(cudnnHandle_t handle,
+                     cudnnSoftmaxAlgorithm_t algo,
+                     cudnnSoftmaxMode_t mode,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t dyDesc,
+                     const void *dy,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t dxDesc,
+                     void *dx);
+
+/* Function to perform backward pooling */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnPoolingBackward(cudnnHandle_t handle,
+                     const cudnnPoolingDescriptor_t poolingDesc,
+                     const void *alpha,
+                     const cudnnTensorDescriptor_t yDesc,
+                     const void *y,
+                     const cudnnTensorDescriptor_t dyDesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t xDesc,
+                     const void *x,
+                     const void *beta,
+                     const cudnnTensorDescriptor_t dxDesc,
+                     void *dx);
+
+/* Function to perform backward activation  */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnActivationBackward(cudnnHandle_t handle,
+                        cudnnActivationDescriptor_t activationDesc,
+                        const void *alpha,
+                        const cudnnTensorDescriptor_t yDesc,
+                        const void *y,
+                        const cudnnTensorDescriptor_t dyDesc,
+                        const void *dy,
+                        const cudnnTensorDescriptor_t xDesc,
+                        const void *x,
+                        const void *beta,
+                        const cudnnTensorDescriptor_t dxDesc,
+                        void *dx);
+
+/* LRN cross-channel backward computation. Double parameters cast to tensor data type */
+cudnnStatus_t CUDNNWINAPI
+cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
+                             cudnnLRNDescriptor_t normDesc,
+                             cudnnLRNMode_t lrnMode,
+                             const void *alpha,
+                             const cudnnTensorDescriptor_t yDesc,
+                             const void *y,
+                             const cudnnTensorDescriptor_t dyDesc,
+                             const void *dy,
+                             const cudnnTensorDescriptor_t xDesc,
+                             const void *x,
+                             const void *beta,
+                             const cudnnTensorDescriptor_t dxDesc,
+                             void *dx);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
+                                   cudnnLRNDescriptor_t normDesc,
+                                   cudnnDivNormMode_t mode,
+                                   const void *alpha,
+                                   const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
+                                   const void *x,
+                                   const void *means, /* if NULL, means are assumed to be zero */
+                                   const void *dy,
+                                   void *temp,
+                                   void *temp2,
+                                   const void *beta,
+                                   const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
+                                   void *dx,                                   /* output x differential */
+                                   void *dMeans); /* output means differential, can be NULL */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
+                                                         cudnnBatchNormMode_t mode,
+                                                         cudnnBatchNormOps_t bnOps,
+                                                         const cudnnTensorDescriptor_t xDesc,
+                                                         const cudnnTensorDescriptor_t zDesc,
+                                                         const cudnnTensorDescriptor_t yDesc,
+                                                         const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+                                                         const cudnnActivationDescriptor_t activationDesc,
+                                                         size_t *sizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
+                                                  cudnnBatchNormMode_t mode,
+                                                  cudnnBatchNormOps_t bnOps,
+                                                  const cudnnTensorDescriptor_t xDesc,
+                                                  const cudnnTensorDescriptor_t yDesc,
+                                                  const cudnnTensorDescriptor_t dyDesc,
+                                                  const cudnnTensorDescriptor_t dzDesc,
+                                                  const cudnnTensorDescriptor_t dxDesc,
+                                                  const cudnnTensorDescriptor_t dBnScaleBiasDesc,
+                                                  const cudnnActivationDescriptor_t activationDesc,
+                                                  size_t *sizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
+                                                     cudnnBatchNormMode_t mode,
+                                                     cudnnBatchNormOps_t bnOps,
+                                                     const cudnnActivationDescriptor_t activationDesc,
+                                                     const cudnnTensorDescriptor_t xDesc,
+                                                     size_t *sizeInBytes);
+
+/* Computes y = BN(x). Also accumulates moving averages of mean and variance */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardTraining(
+    cudnnHandle_t handle,
+    cudnnBatchNormMode_t mode,
+
+    const void *alpha, /* alpha[0] = result blend factor */
+    const void *beta,  /* beta[0] = dest layer blend factor */
+
+    const cudnnTensorDescriptor_t xDesc,
+    const void *x, /* NxCxHxW */
+    const cudnnTensorDescriptor_t yDesc,
+    void *y, /* NxCxHxW */
+
+    /* Shared desc for the next 6 tensors in the argument list.
+       Data type to be set as follows:
+       type = (typeOf(x) == double) ? double : float
+       Dimensions for this descriptor depend on normalization mode
+       - Spatial Normalization : tensors are expected to have dims 1xCx1x1
+        (normalization is performed across NxHxW)
+       - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
+        (normalization is performed across N) */
+    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+
+    /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
+    const void *bnScale,
+    const void *bnBias,
+
+    /* MUST use factor=1 in the very first call of a complete training cycle.
+       Use a factor=1/(1+n) at the (n+1)-th call to the function to get
+       Cumulative Moving Average (CMA) behavior
+       CMA[n] = (x[1]+...+x[n])/n
+       Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
+       ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
+       CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
+    double exponentialAverageFactor,
+
+    /* Used in Training phase only.
+       runningMean = newMean*factor + runningMean*(1-factor) */
+    void *resultRunningMean,
+    /* Output in training mode, input in inference. Is the moving average
+       of variance[x] (factor is applied in the same way as for runningMean) */
+    void *resultRunningVariance,
+
+    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
+    double epsilon,
+
+    /* Optionally save intermediate results from the forward pass here
+       - can be reused to speed up backward pass. NULL if unused */
+    void *resultSaveMean,
+    void *resultSaveInvVariance);
+
+/* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationForwardTrainingEx(
+    cudnnHandle_t handle,
+    cudnnBatchNormMode_t mode,
+    cudnnBatchNormOps_t bnOps,
+
+    const void *alpha, /* alpha[0] = result blend factor */
+    const void *beta,  /* beta[0] = dest layer blend factor */
+
+    const cudnnTensorDescriptor_t xDesc,
+    const void *xData,
+    const cudnnTensorDescriptor_t zDesc,
+    const void *zData,
+    const cudnnTensorDescriptor_t yDesc,
+    void *yData,
+
+    const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
+    const void *bnScale,
+    const void *bnBias,
+
+    double exponentialAverageFactor,
+    void *resultRunningMean,
+    void *resultRunningVariance,
+
+    /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
+    double epsilon,
+
+    /* Optionally save intermediate results from the forward pass here
+       - can be reused to speed up backward pass. NULL if unused */
+    void *resultSaveMean,
+    void *resultSaveInvVariance,
+
+    cudnnActivationDescriptor_t activationDesc,
+    void *workspace,
+    size_t workSpaceSizeInBytes,
+    void *reserveSpace,
+    size_t reserveSpaceSizeInBytes);
+
+/* Performs backward pass of Batch Normalization layer. Returns x gradient,
+* bnScale gradient and bnBias gradient */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationBackward(cudnnHandle_t handle,
+                                cudnnBatchNormMode_t mode,
+                                const void *alphaDataDiff,
+                                const void *betaDataDiff,
+                                const void *alphaParamDiff,
+                                const void *betaParamDiff,
+                                const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
+                                const void *x,
+                                const cudnnTensorDescriptor_t dyDesc,
+                                const void *dy,
+                                const cudnnTensorDescriptor_t dxDesc,
+                                void *dx,
+                                /* Shared tensor desc for the 4 tensors below */
+                                const cudnnTensorDescriptor_t dBnScaleBiasDesc,
+                                const void *bnScale, /* bnBias doesn't affect backpropagation */
+                                /* scale and bias diff are not backpropagated below this layer */
+                                void *dBnScaleResult,
+                                void *dBnBiasResult,
+                                /* Same epsilon as forward pass */
+                                double epsilon,
+
+                                /* Optionally cached intermediate results from
+                                   forward pass */
+                                const void *savedMean,
+                                const void *savedInvVariance);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
+                                  cudnnBatchNormMode_t mode,
+                                  cudnnBatchNormOps_t bnOps,
+
+                                  const void *alphaDataDiff,
+                                  const void *betaDataDiff,
+                                  const void *alphaParamDiff,
+                                  const void *betaParamDiff,
+                                  const cudnnTensorDescriptor_t xDesc,
+                                  const void *xData,
+                                  const cudnnTensorDescriptor_t yDesc,
+                                  const void *yData,
+                                  const cudnnTensorDescriptor_t dyDesc,
+                                  const void *dyData,
+                                  const cudnnTensorDescriptor_t dzDesc,
+                                  void *dzData,
+                                  const cudnnTensorDescriptor_t dxDesc,
+                                  void *dxData,
+
+                                  /* Shared tensor desc for the 4 tensors below */
+                                  const cudnnTensorDescriptor_t dBnScaleBiasDesc,
+                                  const void *bnScaleData,
+                                  const void *bnBiasData, /* needed if there is activation */
+                                  void *dBnScaleData,
+                                  void *dBnBiasData,
+                                  double epsilon, /* Same epsilon as forward pass */
+
+                                  /* Optionally cached intermediate results from
+                                     forward pass */
+                                  const void *savedMean,
+                                  const void *savedInvVariance,
+                                  cudnnActivationDescriptor_t activationDesc,
+                                  void *workSpace,
+                                  size_t workSpaceSizeInBytes,
+                                  void *reserveSpace,
+                                  size_t reserveSpaceSizeInBytes);
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle,
+                                                  cudnnNormMode_t mode,
+                                                  cudnnNormOps_t normOps,
+                                                  cudnnNormAlgo_t algo,
+                                                  const cudnnTensorDescriptor_t xDesc,
+                                                  const cudnnTensorDescriptor_t zDesc,
+                                                  const cudnnTensorDescriptor_t yDesc,
+                                                  const cudnnTensorDescriptor_t normScaleBiasDesc,
+                                                  const cudnnActivationDescriptor_t activationDesc,
+                                                  const cudnnTensorDescriptor_t normMeanVarDesc,
+                                                  size_t *sizeInBytes,
+                                                  int groupCnt); /* Placeholder for future work; should be set to 1 for now */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle,
+                                           cudnnNormMode_t mode,
+                                           cudnnNormOps_t normOps,
+                                           cudnnNormAlgo_t algo,
+                                           const cudnnTensorDescriptor_t xDesc,
+                                           const cudnnTensorDescriptor_t yDesc,
+                                           const cudnnTensorDescriptor_t dyDesc,
+                                           const cudnnTensorDescriptor_t dzDesc,
+                                           const cudnnTensorDescriptor_t dxDesc,
+                                           const cudnnTensorDescriptor_t dNormScaleBiasDesc,
+                                           const cudnnActivationDescriptor_t activationDesc,
+                                           const cudnnTensorDescriptor_t normMeanVarDesc,
+                                           size_t *sizeInBytes,
+                                           int groupCnt); /* Placeholder for future work; should be set to 1 for now */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle,
+                                              cudnnNormMode_t mode,
+                                              cudnnNormOps_t normOps,
+                                              cudnnNormAlgo_t algo,
+                                              const cudnnActivationDescriptor_t activationDesc,
+                                              const cudnnTensorDescriptor_t xDesc,
+                                              size_t *sizeInBytes,
+                                              int groupCnt); /* Placeholder for future work; should be set to 1 for now */
+
+/* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnNormalizationForwardTraining(cudnnHandle_t handle,
+                                  cudnnNormMode_t mode,
+                                  cudnnNormOps_t normOps,
+                                  cudnnNormAlgo_t algo,
+                                  const void *alpha, /* alpha[0] = result blend factor */
+                                  const void *beta,  /* beta[0] = dest layer blend factor */
+                                  const cudnnTensorDescriptor_t xDesc,
+                                  const void *xData,
+                                  const cudnnTensorDescriptor_t normScaleBiasDesc,
+                                  const void *normScale,
+                                  const void *normBias,
+                                  double exponentialAverageFactor,
+                                  const cudnnTensorDescriptor_t normMeanVarDesc,
+                                  void *resultRunningMean,
+                                  void *resultRunningVariance,
+                                  /* Has to be >= 0. Should be the same in forward and backward functions. */
+                                  double epsilon,
+                                  /* Optionally save intermediate results from the forward pass here
+                                     - can be reused to speed up backward pass. NULL if unused */
+                                  void *resultSaveMean,
+                                  void *resultSaveInvVariance,
+                                  cudnnActivationDescriptor_t activationDesc,
+                                  const cudnnTensorDescriptor_t zDesc,
+                                  const void *zData,
+                                  const cudnnTensorDescriptor_t yDesc,
+                                  void *yData,
+                                  void *workspace,
+                                  size_t workSpaceSizeInBytes,
+                                  void *reserveSpace,
+                                  size_t reserveSpaceSizeInBytes,
+                                  int groupCnt); /* Placeholder for future work; should be set to 1 for now */
+
+CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
+cudnnNormalizationBackward(cudnnHandle_t handle,
+                           cudnnNormMode_t mode,
+                           cudnnNormOps_t normOps,
+                           cudnnNormAlgo_t algo,
+                           const void *alphaDataDiff,
+                           const void *betaDataDiff,
+                           const void *alphaParamDiff,
+                           const void *betaParamDiff,
+                           const cudnnTensorDescriptor_t xDesc,
+                           const void *xData,
+                           const cudnnTensorDescriptor_t yDesc,
+                           const void *yData,
+                           const cudnnTensorDescriptor_t dyDesc,
+                           const void *dyData,
+                           const cudnnTensorDescriptor_t dzDesc,
+                           void *dzData,
+                           const cudnnTensorDescriptor_t dxDesc,
+                           void *dxData,
+                           /* Shared tensor desc for the 4 tensors below */
+                           const cudnnTensorDescriptor_t dNormScaleBiasDesc,
+                           const void *normScaleData,
+                           const void *normBiasData, /* needed if there is activation */
+                           void *dNormScaleData,
+                           void *dNormBiasData,
+                           double epsilon, /* Same epsilon as forward pass */
+                           const cudnnTensorDescriptor_t normMeanVarDesc,
+                           /* Optionally cached intermediate results from
+                              forward pass */
+                           const void *savedMean,
+                           const void *savedInvVariance,
+                           cudnnActivationDescriptor_t activationDesc,
+                           void *workSpace,
+                           size_t workSpaceSizeInBytes,
+                           void *reserveSpace,
+                           size_t reserveSpaceSizeInBytes,
+                           int groupCnt); /* Placeholder for future work; should be set to 1 for now */
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
+                                    const cudnnSpatialTransformerDescriptor_t stDesc,
+                                    const void *dgrid,
+                                    void *dtheta);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
+                              cudnnSpatialTransformerDescriptor_t stDesc,
+                              const void *alpha,
+                              const cudnnTensorDescriptor_t xDesc,
+                              const void *x,
+                              const void *beta,
+                              const cudnnTensorDescriptor_t dxDesc,
+                              void *dx,
+                              const void *alphaDgrid,
+                              const cudnnTensorDescriptor_t dyDesc,
+                              const void *dy,
+                              const void *grid,
+                              const void *betaDgrid,
+                              void *dgrid);
+
+cudnnStatus_t CUDNNWINAPI
+cudnnDropoutBackward(cudnnHandle_t handle,
+                     const cudnnDropoutDescriptor_t dropoutDesc,
+                     const cudnnTensorDescriptor_t dydesc,
+                     const void *dy,
+                     const cudnnTensorDescriptor_t dxdesc,
+                     void *dx,
+                     void *reserveSpace,
+                     size_t reserveSpaceSizeInBytes);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif /* CUDNN_OPS_H_ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version_v9.h b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version_v9.h
new file mode 100644
index 0000000000000000000000000000000000000000..51964033f41c8bd5e94886634a0425288091e383
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version_v9.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright 2014-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/**
+ * \file: The master cuDNN version file.
+ */
+
+#ifndef CUDNN_VERSION_H_
+#define CUDNN_VERSION_H_
+/* Version triple of this header: CUDNN_MAJOR.CUDNN_MINOR.CUDNN_PATCHLEVEL */
+#define CUDNN_MAJOR 9
+#define CUDNN_MINOR 1
+#define CUDNN_PATCHLEVEL 0
+
+#define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL) /* evaluates to 90100 for 9.1.0 */
+
+/* cannot use constexpr here since this is a C-only file */
+/* Below is the max SM version this cuDNN library is aware of and supports natively */
+
+#define CUDNN_MAX_SM_MAJOR_NUMBER 9
+#define CUDNN_MAX_SM_MINOR_NUMBER 0
+#define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100 + CUDNN_MAX_SM_MINOR_NUMBER * 10) /* evaluates to 900 */
+
+#endif /* CUDNN_VERSION_H_ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cusolver/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..679189c7c5d9d8323836135d90003f029178b83a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cusolver/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0960ec813fb70c92394376875f03d8074fe80580
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverDn.h b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverDn.h
new file mode 100644
index 0000000000000000000000000000000000000000..fbf1534a79e3fdc727bc520d4e3e898d4ac38ef2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverDn.h
@@ -0,0 +1,4927 @@
+/*
+ * Copyright 2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+/*   cuSolverDN : Dense Linear Algebra Library
+
+*/
+
+#if !defined(CUSOLVERDN_H_)
+  #define CUSOLVERDN_H_
+
+struct cusolverDnContext; // forward declaration only: the struct layout is not given here
+typedef struct cusolverDnContext *cusolverDnHandle_t; // handle type is a pointer to the incomplete struct
+
+struct syevjInfo; // forward declaration only
+typedef struct syevjInfo *syevjInfo_t;
+
+struct gesvdjInfo; // forward declaration only
+typedef struct gesvdjInfo *gesvdjInfo_t;
+
+//------------------------------------------------------
+// opaque cusolverDnIRS structures for the IRS solver (both are forward declarations only)
+struct cusolverDnIRSParams;
+typedef struct cusolverDnIRSParams *cusolverDnIRSParams_t;
+
+struct cusolverDnIRSInfos;
+typedef struct cusolverDnIRSInfos *cusolverDnIRSInfos_t;
+//------------------------------------------------------
+
+struct cusolverDnParams; // forward declaration only
+typedef struct cusolverDnParams *cusolverDnParams_t;
+
+typedef enum { // two explicitly numbered selectors; their consumer is not visible in this part of the header
+  CUSOLVERDN_GETRF = 0, // presumably refers to the getrf routine -- confirm
+  CUSOLVERDN_POTRF = 1 // presumably refers to the potrf routine -- confirm
+} cusolverDnFunction_t;
+
+typedef enum { // mode type used by cusolverDnSetDeterministicMode / cusolverDnGetDeterministicMode
+  CUSOLVER_DETERMINISTIC_RESULTS = 1, // numbering starts at 1; no enumerator has the value 0
+  CUSOLVER_ALLOW_NON_DETERMINISTIC_RESULTS = 2
+} cusolverDeterministicMode_t;
+
+  #include <stdio.h>
+
+  #include "cuComplex.h" /* import complex data type */
+  #include "cublas_v2.h"
+  #include "cusolver_common.h"
+
+  /*******************************************************************************/
+  #ifdef __cplusplus
+extern "C" {
+  #endif
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCreate(cusolverDnHandle_t *handle); // takes a pointer to the handle
+  cusolverStatus_t CUSOLVERAPI cusolverDnDestroy(cusolverDnHandle_t handle); // takes the handle by value
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnSetStream(cusolverDnHandle_t handle, cudaStream_t streamId); // stream is passed by value
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnGetStream(cusolverDnHandle_t handle, cudaStream_t *streamId); // stream is passed by pointer (presumably written by the call -- confirm)
+
+  //============================================================
+  // Deterministic Mode
+  //============================================================
+  cusolverStatus_t CUSOLVERAPI cusolverDnSetDeterministicMode(cusolverDnHandle_t
+    handle, cusolverDeterministicMode_t mode); // mode is passed by value
+  cusolverStatus_t CUSOLVERAPI cusolverDnGetDeterministicMode(cusolverDnHandle_t
+    handle, cusolverDeterministicMode_t* mode); // mode is passed by pointer (presumably written by the call -- confirm)
+
+  //============================================================
+  // IRS headers
+  //============================================================
+
+  // =============================================================================
+  // IRS helper function API
+  // =============================================================================
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnIRSParamsCreate(cusolverDnIRSParams_t *params_ptr); // takes a pointer to the params handle
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnIRSParamsDestroy(cusolverDnIRSParams_t params); // takes the params handle by value
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetRefinementSolver(
+    cusolverDnIRSParams_t   params,
+    cusolverIRSRefinement_t refinement_solver); // type not declared in this part of the header (presumably cusolver_common.h -- confirm)
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverMainPrecision(
+    cusolverDnIRSParams_t params,
+    cusolverPrecType_t    solver_main_precision); // sets only the main precision
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverLowestPrecision(
+    cusolverDnIRSParams_t params,
+    cusolverPrecType_t    solver_lowest_precision); // sets only the lowest precision
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetSolverPrecisions(
+    cusolverDnIRSParams_t params,
+    cusolverPrecType_t    solver_main_precision,
+    cusolverPrecType_t    solver_lowest_precision); // takes both precisions in a single call
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnIRSParamsSetTol(cusolverDnIRSParams_t params, double val); // tolerance is a double passed by value
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnIRSParamsSetTolInner(cusolverDnIRSParams_t params, double val); // separate setter for the "inner" tolerance
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetMaxIters(
+    cusolverDnIRSParams_t params,
+    cusolver_int_t        maxiters); // passed by value
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsSetMaxItersInner(
+    cusolverDnIRSParams_t params,
+    cusolver_int_t        maxiters_inner); // passed by value
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSParamsGetMaxIters(
+    cusolverDnIRSParams_t params,
+    cusolver_int_t *      maxiters); // passed by pointer (presumably written by the call -- confirm)
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnIRSParamsEnableFallback(cusolverDnIRSParams_t params); // no argument besides the params handle
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnIRSParamsDisableFallback(cusolverDnIRSParams_t params); // no argument besides the params handle
+
+  // =============================================================================
+  // cusolverDnIRSInfos prototypes
+  // =============================================================================
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnIRSInfosDestroy(cusolverDnIRSInfos_t infos); // takes the infos handle by value
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnIRSInfosCreate(cusolverDnIRSInfos_t *infos_ptr); // takes a pointer to the infos handle
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetNiters(
+    cusolverDnIRSInfos_t infos,
+    cusolver_int_t *     niters); // passed by pointer (presumably written by the call -- confirm)
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetOuterNiters(
+    cusolverDnIRSInfos_t infos,
+    cusolver_int_t *     outer_niters); // passed by pointer (presumably written by the call -- confirm)
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnIRSInfosRequestResidual(cusolverDnIRSInfos_t infos); // no argument besides the infos handle
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetResidualHistory(
+    cusolverDnIRSInfos_t infos,
+    void **              residual_history); // untyped pointer-to-pointer; element type is not stated here
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSInfosGetMaxIters(
+    cusolverDnIRSInfos_t infos,
+    cusolver_int_t *     maxiters); // passed by pointer (presumably written by the call -- confirm)
+
+  //============================================================
+  //  IRS functions API
+  //============================================================
+
+  /*******************************************************************************/
+  /* [ZZ, ZC, ZK, ZE, ZY, CC, CK, CE, CY, DD, DS, DH, DB, DX, SS, SH, SB, SX]gesv user API prototypes.
+   * dA/dB/dX point to cuDoubleComplex (Z*), cuComplex (C*), double (D*) or float (S*); other parameters are identical. */
+  /*******************************************************************************/
+  cusolverStatus_t CUSOLVERAPI cusolverDnZZgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZCgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZKgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZEgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZYgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCCgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCEgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCKgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCYgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDDgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDSgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDHgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDBgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDXgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSSgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSHgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSBgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSXgesv(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  /*******************************************************************************/
+
+  /*******************************************************************************/
+  /* [ZZ, ZC, ZK, ZE, ZY, CC, CK, CE, CY, DD, DS, DH, DB, DX, SS, SH, SB, SX]gesv_bufferSize user API prototypes.
+   * Parameters match the gesv prototypes up to dWorkspace; lwork_bytes is a size_t pointer and there is no iter/d_info. */
+  /*******************************************************************************/
+  cusolverStatus_t CUSOLVERAPI cusolverDnZZgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZCgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZKgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZEgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZYgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCCgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCKgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCEgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCYgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDDgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDSgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDHgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDBgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDXgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSSgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSHgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSBgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSXgesv_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    cusolver_int_t *   dipiv,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+  /*******************************************************************************/
+
+  /*******************************************************************************/ /*
+                                                                                     * [ZZ, ZC, ZK, ZE, ZY, CC, CK, CE, CY, DD, DS, DH, DB, DX, SS, SH, SB, SX]gels
+                                                                                     * users API Prototypes */
+  /*******************************************************************************/
+  cusolverStatus_t CUSOLVERAPI cusolverDnZZgels( /* Z-input members of the gels family (cuDoubleComplex matrices); NOTE(review): the 2nd letter presumably selects the internal working precision - confirm in cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m, /* NOTE(review): m, n presumably the row/column counts of A - confirm */
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs, /* presumably the number of right-hand sides - confirm */
+    cuDoubleComplex *  dA, /* non-const; "d" prefix presumably means device memory - confirm */
+    cusolver_int_t     ldda, /* presumably the leading dimension of dA; likewise lddb/lddx for dB/dX */
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace, /* untyped work buffer */
+    size_t             lwork_bytes, /* passed by value here; the matching _bufferSize prototype takes a size_t * instead */
+    cusolver_int_t *   iter, /* non-const pointer; presumably receives an iteration count - confirm */
+    cusolver_int_t *   d_info); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZCgels( /* parameter list identical to cusolverDnZZgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZKgels( /* parameter list identical to cusolverDnZZgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZEgels( /* parameter list identical to cusolverDnZZgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZYgels( /* parameter list identical to cusolverDnZZgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCCgels( /* C-input members of the gels family (cuComplex matrices); NOTE(review): the 2nd letter presumably selects the internal working precision - confirm in cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m, /* NOTE(review): m, n presumably the row/column counts of A - confirm */
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs, /* presumably the number of right-hand sides - confirm */
+    cuComplex *        dA, /* non-const; "d" prefix presumably means device memory - confirm */
+    cusolver_int_t     ldda, /* presumably the leading dimension of dA; likewise lddb/lddx for dB/dX */
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace, /* untyped work buffer */
+    size_t             lwork_bytes, /* passed by value here; the matching _bufferSize prototype takes a size_t * instead */
+    cusolver_int_t *   iter, /* non-const pointer; presumably receives an iteration count - confirm */
+    cusolver_int_t *   d_info); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCKgels( /* parameter list identical to cusolverDnCCgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCEgels( /* parameter list identical to cusolverDnCCgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCYgels( /* parameter list identical to cusolverDnCCgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDDgels( /* D-input members of the gels family (double matrices); NOTE(review): the 2nd letter presumably selects the internal working precision - confirm in cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m, /* NOTE(review): m, n presumably the row/column counts of A - confirm */
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs, /* presumably the number of right-hand sides - confirm */
+    double *           dA, /* non-const; "d" prefix presumably means device memory - confirm */
+    cusolver_int_t     ldda, /* presumably the leading dimension of dA; likewise lddb/lddx for dB/dX */
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace, /* untyped work buffer */
+    size_t             lwork_bytes, /* passed by value here; the matching _bufferSize prototype takes a size_t * instead */
+    cusolver_int_t *   iter, /* non-const pointer; presumably receives an iteration count - confirm */
+    cusolver_int_t *   d_info); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDSgels( /* parameter list identical to cusolverDnDDgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDHgels( /* parameter list identical to cusolverDnDDgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDBgels( /* parameter list identical to cusolverDnDDgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDXgels( /* parameter list identical to cusolverDnDDgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSSgels( /* S-input members of the gels family (float matrices); NOTE(review): the 2nd letter presumably selects the internal working precision - confirm in cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m, /* NOTE(review): m, n presumably the row/column counts of A - confirm */
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs, /* presumably the number of right-hand sides - confirm */
+    float *            dA, /* non-const; "d" prefix presumably means device memory - confirm */
+    cusolver_int_t     ldda, /* presumably the leading dimension of dA; likewise lddb/lddx for dB/dX */
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace, /* untyped work buffer */
+    size_t             lwork_bytes, /* passed by value here; the matching _bufferSize prototype takes a size_t * instead */
+    cusolver_int_t *   iter, /* non-const pointer; presumably receives an iteration count - confirm */
+    cusolver_int_t *   d_info); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSHgels( /* parameter list identical to cusolverDnSSgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSBgels( /* parameter list identical to cusolverDnSSgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSXgels( /* parameter list identical to cusolverDnSSgels */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t             lwork_bytes,
+    cusolver_int_t *   iter,
+    cusolver_int_t *   d_info);
+  /*******************************************************************************/
+
+  /*******************************************************************************/
+  /* [ZZ, ZC, ZK, ZE, ZY, CC, CK, CE, CY, DD, DS, DH, DB, DX, SS, SH, SB, SX]gels_bufferSize */
+  /* API prototypes */
+  /*******************************************************************************/
+  cusolverStatus_t CUSOLVERAPI cusolverDnZZgels_bufferSize( /* workspace-size companions of the Z-input gels prototypes (cuDoubleComplex matrices) */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m, /* NOTE(review): m, n presumably the row/column counts of A - confirm */
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs, /* presumably the number of right-hand sides - confirm */
+    cuDoubleComplex *  dA, /* NOTE(review): unclear from the prototype whether the matrix pointers are dereferenced by a size query - confirm */
+    cusolver_int_t     ldda,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace, /* NOTE(review): a workspace pointer is also accepted here; its role in a size query is not visible - confirm */
+    size_t *           lwork_bytes); /* pointer, unlike the by-value size_t of the solver; presumably receives the required size in bytes */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZCgels_bufferSize( /* parameter list identical to cusolverDnZZgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZKgels_bufferSize( /* parameter list identical to cusolverDnZZgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZEgels_bufferSize( /* parameter list identical to cusolverDnZZgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZYgels_bufferSize( /* parameter list identical to cusolverDnZZgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuDoubleComplex *  dA,
+    cusolver_int_t     ldda,
+    cuDoubleComplex *  dB,
+    cusolver_int_t     lddb,
+    cuDoubleComplex *  dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCCgels_bufferSize( /* workspace-size companions of the C-input gels prototypes (cuComplex matrices) */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m, /* NOTE(review): m, n presumably the row/column counts of A - confirm */
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs, /* presumably the number of right-hand sides - confirm */
+    cuComplex *        dA, /* NOTE(review): unclear from the prototype whether the matrix pointers are dereferenced by a size query - confirm */
+    cusolver_int_t     ldda,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace, /* NOTE(review): a workspace pointer is also accepted here; its role in a size query is not visible - confirm */
+    size_t *           lwork_bytes); /* pointer, unlike the by-value size_t of the solver; presumably receives the required size in bytes */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCKgels_bufferSize( /* parameter list identical to cusolverDnCCgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCEgels_bufferSize( /* parameter list identical to cusolverDnCCgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCYgels_bufferSize( /* parameter list identical to cusolverDnCCgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    cuComplex *        dA,
+    cusolver_int_t     ldda,
+    cuComplex *        dB,
+    cusolver_int_t     lddb,
+    cuComplex *        dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDDgels_bufferSize( /* workspace-size companions of the D-input gels prototypes (double matrices) */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m, /* NOTE(review): m, n presumably the row/column counts of A - confirm */
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs, /* presumably the number of right-hand sides - confirm */
+    double *           dA, /* NOTE(review): unclear from the prototype whether the matrix pointers are dereferenced by a size query - confirm */
+    cusolver_int_t     ldda,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace, /* NOTE(review): a workspace pointer is also accepted here; its role in a size query is not visible - confirm */
+    size_t *           lwork_bytes); /* pointer, unlike the by-value size_t of the solver; presumably receives the required size in bytes */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDSgels_bufferSize( /* parameter list identical to cusolverDnDDgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDHgels_bufferSize( /* parameter list identical to cusolverDnDDgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDBgels_bufferSize( /* parameter list identical to cusolverDnDDgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDXgels_bufferSize( /* parameter list identical to cusolverDnDDgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    double *           dA,
+    cusolver_int_t     ldda,
+    double *           dB,
+    cusolver_int_t     lddb,
+    double *           dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSSgels_bufferSize( /* workspace-size companions of the S-input gels prototypes (float matrices) */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m, /* NOTE(review): m, n presumably the row/column counts of A - confirm */
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs, /* presumably the number of right-hand sides - confirm */
+    float *            dA, /* NOTE(review): unclear from the prototype whether the matrix pointers are dereferenced by a size query - confirm */
+    cusolver_int_t     ldda,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace, /* NOTE(review): a workspace pointer is also accepted here; its role in a size query is not visible - confirm */
+    size_t *           lwork_bytes); /* pointer, unlike the by-value size_t of the solver; presumably receives the required size in bytes */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSHgels_bufferSize( /* parameter list identical to cusolverDnSSgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSBgels_bufferSize( /* parameter list identical to cusolverDnSSgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSXgels_bufferSize( /* parameter list identical to cusolverDnSSgels_bufferSize */
+    cusolverDnHandle_t handle,
+    cusolver_int_t     m,
+    cusolver_int_t     n,
+    cusolver_int_t     nrhs,
+    float *            dA,
+    cusolver_int_t     ldda,
+    float *            dB,
+    cusolver_int_t     lddb,
+    float *            dX,
+    cusolver_int_t     lddx,
+    void *             dWorkspace,
+    size_t *           lwork_bytes);
+  /*******************************************************************************/
+
+  /*******************************************************************************/
+  /* expert users API for IRS */
+  /* prototypes */
+  /*******************************************************************************/
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgesv( /* type-erased gesv entry point: matrices are void *, configuration comes in via the IRS params/infos handles */
+    cusolverDnHandle_t    handle,
+    cusolverDnIRSParams_t gesv_irs_params, /* NOTE(review): presumably carries the precision/refinement settings - confirm */
+    cusolverDnIRSInfos_t  gesv_irs_infos, /* NOTE(review): presumably collects per-solve diagnostics - confirm */
+    cusolver_int_t        n, /* only n is taken (no m), unlike cusolverDnIRSXgels */
+    cusolver_int_t        nrhs,
+    void *                dA, /* element type is not expressed in the prototype */
+    cusolver_int_t        ldda,
+    void *                dB,
+    cusolver_int_t        lddb,
+    void *                dX,
+    cusolver_int_t        lddx,
+    void *                dWorkspace,
+    size_t                lwork_bytes, /* by value; cusolverDnIRSXgesv_bufferSize takes a size_t * */
+    cusolver_int_t *      niters, /* non-const pointer; presumably receives an iteration count - confirm */
+    cusolver_int_t *      d_info); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgesv_bufferSize( /* size query: takes only the handle, params and problem dimensions - no matrix pointers */
+    cusolverDnHandle_t    handle,
+    cusolverDnIRSParams_t params,
+    cusolver_int_t        n,
+    cusolver_int_t        nrhs,
+    size_t *              lwork_bytes); /* presumably receives the required workspace size in bytes - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgels( /* type-erased gels entry point: matrices are void *, configuration comes in via the IRS params/infos handles */
+    cusolverDnHandle_t    handle,
+    cusolverDnIRSParams_t gels_irs_params, /* NOTE(review): presumably carries the precision/refinement settings - confirm */
+    cusolverDnIRSInfos_t  gels_irs_infos, /* NOTE(review): presumably collects per-solve diagnostics - confirm */
+    cusolver_int_t        m, /* both m and n are taken, unlike cusolverDnIRSXgesv */
+    cusolver_int_t        n,
+    cusolver_int_t        nrhs,
+    void *                dA, /* element type is not expressed in the prototype */
+    cusolver_int_t        ldda,
+    void *                dB,
+    cusolver_int_t        lddb,
+    void *                dX,
+    cusolver_int_t        lddx,
+    void *                dWorkspace,
+    size_t                lwork_bytes, /* by value; cusolverDnIRSXgels_bufferSize takes a size_t * */
+    cusolver_int_t *      niters, /* non-const pointer; presumably receives an iteration count - confirm */
+    cusolver_int_t *      d_info); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnIRSXgels_bufferSize( /* size query: takes only the handle, params and problem dimensions - no matrix pointers */
+    cusolverDnHandle_t    handle,
+    cusolverDnIRSParams_t params,
+    cusolver_int_t        m,
+    cusolver_int_t        n,
+    cusolver_int_t        nrhs,
+    size_t *              lwork_bytes); /* presumably receives the required workspace size in bytes - confirm */
+  /*******************************************************************************/
+
+  /* Cholesky factorization and its solver */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSpotrf_bufferSize( /* workspace query for cusolverDnSpotrf; S/D/C/Z variants differ only in the element type of A */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo, /* NOTE(review): presumably selects which triangle of A is referenced - confirm */
+    int                n,
+    float *            A,
+    int                lda,
+    int *              Lwork); /* int-typed (not size_t); NOTE(review): presumably an element count rather than bytes - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDpotrf_bufferSize( /* double variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    int *              Lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCpotrf_bufferSize( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    int *              Lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZpotrf_bufferSize( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    int *              Lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSpotrf( /* potrf entry point; S/D/C/Z variants differ only in the element type of A and Workspace */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo, /* NOTE(review): presumably selects which triangle of A is referenced - confirm */
+    int                n,
+    float *            A, /* non-const; NOTE(review): presumably overwritten with the factor - confirm */
+    int                lda,
+    float *            Workspace, /* typed like A, not void * */
+    int                Lwork, /* by value; the _bufferSize prototype takes an int * */
+    int *              devInfo); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDpotrf( /* double variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           Workspace,
+    int                Lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCpotrf( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    cuComplex *        Workspace,
+    int                Lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZpotrf( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    cuDoubleComplex *  Workspace,
+    int                Lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSpotrs( /* potrs entry point; takes no workspace argument; S/D/C/Z variants differ only in element type */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    int                nrhs,
+    const float *      A, /* const-qualified: not written through this pointer */
+    int                lda,
+    float *            B, /* non-const; NOTE(review): presumably overwritten with the solution - confirm */
+    int                ldb,
+    int *              devInfo); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDpotrs( /* double variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    int                nrhs,
+    const double *     A,
+    int                lda,
+    double *           B,
+    int                ldb,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCpotrs( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    int                nrhs,
+    const cuComplex *  A,
+    int                lda,
+    cuComplex *        B,
+    int                ldb,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZpotrs( /* cuDoubleComplex variant */
+    cusolverDnHandle_t     handle,
+    cublasFillMode_t       uplo,
+    int                    n,
+    int                    nrhs,
+    const cuDoubleComplex *A,
+    int                    lda,
+    cuDoubleComplex *      B,
+    int                    ldb,
+    int *                  devInfo);
+
+  /* batched Cholesky factorization and its solver */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSpotrfBatched( /* batched potrf; takes no workspace argument; S/D/C/Z variants differ only in element type */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n, /* a single n and lda are shared by the whole batch */
+    float *            Aarray[], /* array of matrix pointers; NOTE(review): presumably batchSize entries - confirm */
+    int                lda,
+    int *              infoArray, /* NOTE(review): presumably one status entry per matrix - confirm */
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDpotrfBatched( /* double variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           Aarray[],
+    int                lda,
+    int *              infoArray,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCpotrfBatched( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        Aarray[],
+    int                lda,
+    int *              infoArray,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZpotrfBatched( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  Aarray[],
+    int                lda,
+    int *              infoArray,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSpotrsBatched( /* batched potrs; S/D/C/Z variants differ only in element type */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    int                nrhs, /* only nrhs = 1 is supported */
+    float *            A[], /* array of matrix pointers; not const-qualified, unlike A in the non-batched potrs */
+    int                lda,
+    float *            B[],
+    int                ldb,
+    int *              d_info, /* NOTE(review): unlike potrfBatched's infoArray this is named as a single value - confirm whether it is per-batch or per-matrix */
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDpotrsBatched( /* double variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    int                nrhs, /* only nrhs = 1 is supported */
+    double *           A[],
+    int                lda,
+    double *           B[],
+    int                ldb,
+    int *              d_info,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCpotrsBatched( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    int                nrhs, /* only nrhs = 1 is supported */
+    cuComplex *        A[],
+    int                lda,
+    cuComplex *        B[],
+    int                ldb,
+    int *              d_info,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZpotrsBatched( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    int                nrhs, /* only nrhs = 1 is supported */
+    cuDoubleComplex *  A[],
+    int                lda,
+    cuDoubleComplex *  B[],
+    int                ldb,
+    int *              d_info,
+    int                batchSize);
+
+  /* s.p.d. matrix inversion (POTRI) and auxiliary routines (TRTRI and LAUUM) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSpotri_bufferSize( /* workspace query for cusolverDnSpotri; S/D/C/Z variants differ only in the element type of A */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo, /* NOTE(review): presumably selects which triangle of A is referenced - confirm */
+    int                n,
+    float *            A,
+    int                lda,
+    int *              lwork); /* int-typed (not size_t); NOTE(review): presumably an element count rather than bytes - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDpotri_bufferSize( /* double variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCpotri_bufferSize( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZpotri_bufferSize( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSpotri( /* potri entry point; S/D/C/Z variants differ only in the element type of A and work */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo, /* NOTE(review): presumably selects which triangle of A is referenced - confirm */
+    int                n,
+    float *            A, /* non-const; NOTE(review): presumably overwritten with the inverse - confirm */
+    int                lda,
+    float *            work, /* typed like A, not void * */
+    int                lwork, /* by value; the _bufferSize prototype takes an int * */
+    int *              devInfo); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDpotri( /* double variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           work,
+    int                lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCpotri( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    cuComplex *        work,
+    int                lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZpotri( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXtrtri_bufferSize( /* workspace query for cusolverDnXtrtri; reports separate device and host sizes */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    cublasDiagType_t   diag, /* NOTE(review): presumably unit vs non-unit diagonal - confirm */
+    int64_t            n, /* 64-bit dimensions, unlike the int-based S/D/C/Z routines around it */
+    cudaDataType       dataTypeA, /* element type of A is passed at run time because A is void * */
+    void *             A,
+    int64_t            lda,
+    size_t *           workspaceInBytesOnDevice, /* presumably receives the device workspace size in bytes */
+    size_t *           workspaceInBytesOnHost); /* presumably receives the host workspace size in bytes */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXtrtri( /* type-generic trtri entry point with 64-bit dimensions */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    cublasDiagType_t   diag,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A, /* non-const; NOTE(review): presumably inverted in place - confirm */
+    int64_t            lda,
+    void *             bufferOnDevice, /* paired with workspaceInBytesOnDevice */
+    size_t             workspaceInBytesOnDevice,
+    void *             bufferOnHost, /* paired with workspaceInBytesOnHost */
+    size_t             workspaceInBytesOnHost,
+    int *              devInfo); /* non-const pointer; presumably receives a status code - confirm */
+
+  /* lauum, auxiliary routine for s.p.d. matrix inversion */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSlauum_bufferSize( /* workspace query for cusolverDnSlauum; S/D/C/Z variants differ only in the element type of A */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo, /* NOTE(review): presumably selects which triangle of A is referenced - confirm */
+    int                n,
+    float *            A,
+    int                lda,
+    int *              lwork); /* int-typed (not size_t); NOTE(review): presumably an element count rather than bytes - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDlauum_bufferSize( /* double variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnClauum_bufferSize( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZlauum_bufferSize( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSlauum( /* lauum entry point; S/D/C/Z variants differ only in the element type of A and work */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo, /* NOTE(review): presumably selects which triangle of A is referenced - confirm */
+    int                n,
+    float *            A, /* non-const; NOTE(review): presumably updated in place - confirm */
+    int                lda,
+    float *            work, /* typed like A, not void * */
+    int                lwork, /* by value; the _bufferSize prototype takes an int * */
+    int *              devInfo); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDlauum( /* double variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           work,
+    int                lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnClauum( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    cuComplex *        work,
+    int                lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZlauum( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              devInfo);
+
+  /* LU Factorization */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf_bufferSize( /* workspace query for cusolverDnSgetrf; S/D/C/Z variants differ only in the element type of A */
+    cusolverDnHandle_t handle,
+    int                m, /* NOTE(review): m, n presumably the row/column counts of A - confirm */
+    int                n,
+    float *            A,
+    int                lda,
+    int *              Lwork); /* int-typed (not size_t); NOTE(review): presumably an element count rather than bytes - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf_bufferSize( /* double variant */
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    double *           A,
+    int                lda,
+    int *              Lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgetrf_bufferSize( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    int *              Lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgetrf_bufferSize( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    int *              Lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgetrf( /* getrf entry point; S/D/C/Z variants differ only in the element type of A and Workspace */
+    cusolverDnHandle_t handle,
+    int                m, /* NOTE(review): m, n presumably the row/column counts of A - confirm */
+    int                n,
+    float *            A, /* non-const; NOTE(review): presumably overwritten with the factors - confirm */
+    int                lda,
+    float *            Workspace, /* no size argument accompanies Workspace here, unlike potrf's Lwork */
+    int *              devIpiv, /* non-const pointer; NOTE(review): presumably receives the pivot indices - confirm */
+    int *              devInfo); /* non-const pointer; presumably receives a status code - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgetrf( /* double variant */
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           Workspace,
+    int *              devIpiv,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgetrf( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    cuComplex *        Workspace,
+    int *              devIpiv,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgetrf( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    cuDoubleComplex *  Workspace,
+    int *              devIpiv,
+    int *              devInfo);
+
+  /* Row pivoting */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSlaswp( /* laswp entry point; no workspace or info argument; S/D/C/Z variants differ only in the element type of A */
+    cusolverDnHandle_t handle,
+    int                n,
+    float *            A, /* non-const; NOTE(review): presumably has its rows permuted in place - confirm */
+    int                lda,
+    int                k1, /* NOTE(review): k1, k2 presumably bound the range of pivot entries applied - confirm */
+    int                k2,
+    const int *        devIpiv, /* const-qualified: not written through this pointer */
+    int                incx); /* NOTE(review): presumably the stride through devIpiv - confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDlaswp( /* double variant */
+    cusolverDnHandle_t handle,
+    int                n,
+    double *           A,
+    int                lda,
+    int                k1,
+    int                k2,
+    const int *        devIpiv,
+    int                incx);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnClaswp( /* cuComplex variant */
+    cusolverDnHandle_t handle,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    int                k1,
+    int                k2,
+    const int *        devIpiv,
+    int                incx);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZlaswp( /* cuDoubleComplex variant */
+    cusolverDnHandle_t handle,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    int                k1,
+    int                k2,
+    const int *        devIpiv,
+    int                incx);
+
+  /* LU solve */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgetrs(
+    cusolverDnHandle_t handle,
+    cublasOperation_t  trans,
+    int                n,
+    int                nrhs,
+    const float *      A,
+    int                lda,
+    const int *        devIpiv,
+    float *            B,
+    int                ldb,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgetrs(
+    cusolverDnHandle_t handle,
+    cublasOperation_t  trans,
+    int                n,
+    int                nrhs,
+    const double *     A,
+    int                lda,
+    const int *        devIpiv,
+    double *           B,
+    int                ldb,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgetrs(
+    cusolverDnHandle_t handle,
+    cublasOperation_t  trans,
+    int                n,
+    int                nrhs,
+    const cuComplex *  A,
+    int                lda,
+    const int *        devIpiv,
+    cuComplex *        B,
+    int                ldb,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgetrs(
+    cusolverDnHandle_t     handle,
+    cublasOperation_t      trans,
+    int                    n,
+    int                    nrhs,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const int *            devIpiv,
+    cuDoubleComplex *      B,
+    int                    ldb,
+    int *                  devInfo);
+
+  /* QR factorization */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    float *            A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    double *           A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgeqrf_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgeqrf_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgeqrf(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            TAU,
+    float *            Workspace,
+    int                Lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgeqrf(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           TAU,
+    double *           Workspace,
+    int                Lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgeqrf(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    cuComplex *        TAU,
+    cuComplex *        Workspace,
+    int                Lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgeqrf(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    cuDoubleComplex *  TAU,
+    cuDoubleComplex *  Workspace,
+    int                Lwork,
+    int *              devInfo);
+
+  /* generate unitary matrix Q from QR factorization */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int                k,
+    const float *      A,
+    int                lda,
+    const float *      tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int                k,
+    const double *     A,
+    int                lda,
+    const double *     tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCungqr_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int                k,
+    const cuComplex *  A,
+    int                lda,
+    const cuComplex *  tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZungqr_bufferSize(
+    cusolverDnHandle_t     handle,
+    int                    m,
+    int                    n,
+    int                    k,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const cuDoubleComplex *tau,
+    int *                  lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSorgqr(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int                k,
+    float *            A,
+    int                lda,
+    const float *      tau,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDorgqr(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int                k,
+    double *           A,
+    int                lda,
+    const double *     tau,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCungqr(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int                k,
+    cuComplex *        A,
+    int                lda,
+    const cuComplex *  tau,
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZungqr(
+    cusolverDnHandle_t     handle,
+    int                    m,
+    int                    n,
+    int                    k,
+    cuDoubleComplex *      A,
+    int                    lda,
+    const cuDoubleComplex *tau,
+    cuDoubleComplex *      work,
+    int                    lwork,
+    int *                  info);
+
+  /* compute Q**T*b when solving the least-squares problem min ||A*x - b|| */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSormqr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    int                k,
+    const float *      A,
+    int                lda,
+    const float *      tau,
+    const float *      C,
+    int                ldc,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDormqr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    int                k,
+    const double *     A,
+    int                lda,
+    const double *     tau,
+    const double *     C,
+    int                ldc,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    int                k,
+    const cuComplex *  A,
+    int                lda,
+    const cuComplex *  tau,
+    const cuComplex *  C,
+    int                ldc,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr_bufferSize(
+    cusolverDnHandle_t     handle,
+    cublasSideMode_t       side,
+    cublasOperation_t      trans,
+    int                    m,
+    int                    n,
+    int                    k,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const cuDoubleComplex *tau,
+    const cuDoubleComplex *C,
+    int                    ldc,
+    int *                  lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSormqr(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    int                k,
+    const float *      A,
+    int                lda,
+    const float *      tau,
+    float *            C,
+    int                ldc,
+    float *            work,
+    int                lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDormqr(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    int                k,
+    const double *     A,
+    int                lda,
+    const double *     tau,
+    double *           C,
+    int                ldc,
+    double *           work,
+    int                lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCunmqr(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    int                k,
+    const cuComplex *  A,
+    int                lda,
+    const cuComplex *  tau,
+    cuComplex *        C,
+    int                ldc,
+    cuComplex *        work,
+    int                lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZunmqr(
+    cusolverDnHandle_t     handle,
+    cublasSideMode_t       side,
+    cublasOperation_t      trans,
+    int                    m,
+    int                    n,
+    int                    k,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const cuDoubleComplex *tau,
+    cuDoubleComplex *      C,
+    int                    ldc,
+    cuDoubleComplex *      work,
+    int                    lwork,
+    int *                  devInfo);
+
+  /* L*D*L**T, U*D*U**T factorization */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf_bufferSize(
+    cusolverDnHandle_t handle,
+    int                n,
+    float *            A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf_bufferSize(
+    cusolverDnHandle_t handle,
+    int                n,
+    double *           A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf_bufferSize(
+    cusolverDnHandle_t handle,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf_bufferSize(
+    cusolverDnHandle_t handle,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsytrf(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,
+    int                lda,
+    int *              ipiv,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsytrf(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    int *              ipiv,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCsytrf(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    int *              ipiv,
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZsytrf(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    int *              ipiv,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info);
+
+  /* Symmetric indefinite solve (SYTRS) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXsytrs_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    int64_t            nrhs,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    const int64_t *    ipiv,
+    cudaDataType       dataTypeB,
+    void *             B,
+    int64_t            ldb,
+    size_t *           workspaceInBytesOnDevice,
+    size_t *           workspaceInBytesOnHost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXsytrs(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    int64_t            nrhs,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    const int64_t *    ipiv,
+    cudaDataType       dataTypeB,
+    void *             B,
+    int64_t            ldb,
+    void *             bufferOnDevice,
+    size_t             workspaceInBytesOnDevice,
+    void *             bufferOnHost,
+    size_t             workspaceInBytesOnHost,
+    int *              info);
+
+  /* Symmetric indefinite inversion (sytri) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsytri_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,
+    int                lda,
+    const int *        ipiv,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsytri_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    const int *        ipiv,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCsytri_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    const int *        ipiv,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZsytri_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    const int *        ipiv,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsytri(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,
+    int                lda,
+    const int *        ipiv,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsytri(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    const int *        ipiv,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCsytri(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    const int *        ipiv,
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZsytri(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    const int *        ipiv,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info);
+
+  /* bidiagonal factorization */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int *              Lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int *              Lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int *              Lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int *              Lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgebrd(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            D,
+    float *            E,
+    float *            TAUQ,
+    float *            TAUP,
+    float *            Work,
+    int                Lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgebrd(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           D,
+    double *           E,
+    double *           TAUQ,
+    double *           TAUP,
+    double *           Work,
+    int                Lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgebrd(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    float *            D,
+    float *            E,
+    cuComplex *        TAUQ,
+    cuComplex *        TAUP,
+    cuComplex *        Work,
+    int                Lwork,
+    int *              devInfo);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgebrd(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    double *           D,
+    double *           E,
+    cuDoubleComplex *  TAUQ,
+    cuDoubleComplex *  TAUP,
+    cuDoubleComplex *  Work,
+    int                Lwork,
+    int *              devInfo);
+
+  /* generates one of the unitary matrices Q or P**T determined by GEBRD */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    int                m,
+    int                n,
+    int                k,
+    const float *      A,
+    int                lda,
+    const float *      tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    int                m,
+    int                n,
+    int                k,
+    const double *     A,
+    int                lda,
+    const double *     tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCungbr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    int                m,
+    int                n,
+    int                k,
+    const cuComplex *  A,
+    int                lda,
+    const cuComplex *  tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZungbr_bufferSize(
+    cusolverDnHandle_t     handle,
+    cublasSideMode_t       side,
+    int                    m,
+    int                    n,
+    int                    k,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const cuDoubleComplex *tau,
+    int *                  lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSorgbr(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    int                m,
+    int                n,
+    int                k,
+    float *            A,
+    int                lda,
+    const float *      tau,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDorgbr(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    int                m,
+    int                n,
+    int                k,
+    double *           A,
+    int                lda,
+    const double *     tau,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCungbr(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    int                m,
+    int                n,
+    int                k,
+    cuComplex *        A,
+    int                lda,
+    const cuComplex *  tau,
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZungbr(
+    cusolverDnHandle_t     handle,
+    cublasSideMode_t       side,
+    int                    m,
+    int                    n,
+    int                    k,
+    cuDoubleComplex *      A,
+    int                    lda,
+    const cuDoubleComplex *tau,
+    cuDoubleComplex *      work,
+    int                    lwork,
+    int *                  info);
+
+  /* tridiagonal factorization */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      d,
+    const float *      e,
+    const float *      tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     d,
+    const double *     e,
+    const double *     tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnChetrd_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const float *      d,
+    const float *      e,
+    const cuComplex *  tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd_bufferSize(
+    cusolverDnHandle_t     handle,
+    cublasFillMode_t       uplo,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const double *         d,
+    const double *         e,
+    const cuDoubleComplex *tau,
+    int *                  lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsytrd(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            d,
+    float *            e,
+    float *            tau,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsytrd(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           d,
+    double *           e,
+    double *           tau,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnChetrd(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    float *            d,
+    float *            e,
+    cuComplex *        tau,
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZhetrd(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    double *           d,
+    double *           e,
+    cuDoubleComplex *  tau,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info);
+
+  /* generate the unitary matrix Q that comes from sytrd */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCungtr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const cuComplex *  tau,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZungtr_bufferSize(
+    cusolverDnHandle_t     handle,
+    cublasFillMode_t       uplo,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const cuDoubleComplex *tau,
+    int *                  lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSorgtr(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,
+    int                lda,
+    const float *      tau,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDorgtr(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    const double *     tau,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCungtr(
+    cusolverDnHandle_t handle,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    const cuComplex *  tau,
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZungtr(
+    cusolverDnHandle_t     handle,
+    cublasFillMode_t       uplo,
+    int                    n,
+    cuDoubleComplex *      A,
+    int                    lda,
+    const cuDoubleComplex *tau,
+    cuDoubleComplex *      work,
+    int                    lwork,
+    int *                  info);
+
+  /* compute op(Q)*C or C*op(Q) where Q comes from sytrd */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSormtr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasFillMode_t   uplo,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      tau,
+    const float *      C,
+    int                ldc,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDormtr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasFillMode_t   uplo,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     tau,
+    const double *     C,
+    int                ldc,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCunmtr_bufferSize(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasFillMode_t   uplo,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const cuComplex *  tau,
+    const cuComplex *  C,
+    int                ldc,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr_bufferSize(
+    cusolverDnHandle_t     handle,
+    cublasSideMode_t       side,
+    cublasFillMode_t       uplo,
+    cublasOperation_t      trans,
+    int                    m,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const cuDoubleComplex *tau,
+    const cuDoubleComplex *C,
+    int                    ldc,
+    int *                  lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSormtr(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasFillMode_t   uplo,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            tau,
+    float *            C,
+    int                ldc,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDormtr(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasFillMode_t   uplo,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           tau,
+    double *           C,
+    int                ldc,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCunmtr(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasFillMode_t   uplo,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    cuComplex *        tau,
+    cuComplex *        C,
+    int                ldc,
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZunmtr(
+    cusolverDnHandle_t handle,
+    cublasSideMode_t   side,
+    cublasFillMode_t   uplo,
+    cublasOperation_t  trans,
+    int                m,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    cuDoubleComplex *  tau,
+    cuDoubleComplex *  C,
+    int                ldc,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info);
+
+  /* singular value decomposition, A = U * Sigma * V^H */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgesvd_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgesvd_bufferSize(
+    cusolverDnHandle_t handle,
+    int                m,
+    int                n,
+    int *              lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgesvd(
+    cusolverDnHandle_t handle,
+    signed char        jobu,
+    signed char        jobvt,
+    int                m,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            S,
+    float *            U,
+    int                ldu,
+    float *            VT,
+    int                ldvt,
+    float *            work,
+    int                lwork,
+    float *            rwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgesvd(  /* SVD group (A = U * Sigma * V^H): double A, S, U, VT, work and rwork */
+    cusolverDnHandle_t handle,
+    signed char        jobu,  /* jobu/jobvt are signed char selectors; accepted values are not visible in this chunk */
+    signed char        jobvt,
+    int                m,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           S,
+    double *           U,
+    int                ldu,
+    double *           VT,
+    int                ldvt,
+    double *           work,
+    int                lwork,
+    double *           rwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgesvd(  /* SVD group (A = U * Sigma * V^H): cuComplex A, U, VT and work */
+    cusolverDnHandle_t handle,
+    signed char        jobu,  /* jobu/jobvt are signed char selectors; accepted values are not visible in this chunk */
+    signed char        jobvt,
+    int                m,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    float *            S,  /* S is float, not cuComplex */
+    cuComplex *        U,
+    int                ldu,
+    cuComplex *        VT,
+    int                ldvt,
+    cuComplex *        work,
+    int                lwork,
+    float *            rwork,  /* float, even though work is cuComplex */
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgesvd(  /* SVD group (A = U * Sigma * V^H): cuDoubleComplex A, U, VT and work */
+    cusolverDnHandle_t handle,
+    signed char        jobu,  /* jobu/jobvt are signed char selectors; accepted values are not visible in this chunk */
+    signed char        jobvt,
+    int                m,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    double *           S,  /* S is double, not cuDoubleComplex */
+    cuDoubleComplex *  U,
+    int                ldu,
+    cuDoubleComplex *  VT,
+    int                ldvt,
+    cuDoubleComplex *  work,
+    int                lwork,
+    double *           rwork,  /* double, even though work is cuDoubleComplex */
+    int *              info);
+
+  /* standard symmetric eigenvalue solver, A*x = lambda*x, by divide-and-conquer
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd_bufferSize(  /* symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): const float A and W */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      W,
+    int *              lwork);  /* only writable argument; presumably the work size for cusolverDnSsyevd -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd_bufferSize(  /* symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): const double A and W */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     W,
+    int *              lwork);  /* only writable argument; presumably the work size for cusolverDnDsyevd -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCheevd_bufferSize(  /* symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): const cuComplex A, const float W */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const float *      W,
+    int *              lwork);  /* only writable argument; presumably the work size for cusolverDnCheevd -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZheevd_bufferSize(  /* symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): const cuDoubleComplex A, const double W */
+    cusolverDnHandle_t     handle,
+    cusolverEigMode_t      jobz,
+    cublasFillMode_t       uplo,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const double *         W,
+    int *                  lwork);  /* only writable argument; presumably the work size for cusolverDnZheevd -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsyevd(  /* symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): float A, W and work */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,  /* non-const here, unlike in cusolverDnSsyevd_bufferSize */
+    int                lda,
+    float *            W,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsyevd(  /* symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): double A, W and work */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,  /* non-const here, unlike in cusolverDnDsyevd_bufferSize */
+    int                lda,
+    double *           W,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCheevd(  /* symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): cuComplex A and work */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    float *            W,  /* W is float, not cuComplex */
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZheevd(  /* symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): cuDoubleComplex A and work */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    double *           W,  /* W is double, not cuDoubleComplex */
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info);
+
+  /* standard selective symmetric eigenvalue solver, A*x = lambda*x, by
+   * divide-and-conquer  */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx_bufferSize(  /* selective symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): const float A and W */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    const float *      A,
+    int                lda,
+    float              vl,  /* vl/vu are float, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    float              vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    const float *      W,
+    int *              lwork);  /* presumably the work size for cusolverDnSsyevdx -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx_bufferSize(  /* selective symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): const double A and W */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    const double *     A,
+    int                lda,
+    double             vl,  /* vl/vu are double, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    double             vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    const double *     W,
+    int *              lwork);  /* presumably the work size for cusolverDnDsyevdx -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCheevdx_bufferSize(  /* selective symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): const cuComplex A, const float W */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    float              vl,  /* vl/vu are float, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    float              vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    const float *      W,
+    int *              lwork);  /* presumably the work size for cusolverDnCheevdx -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZheevdx_bufferSize(  /* selective symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): const cuDoubleComplex A, const double W */
+    cusolverDnHandle_t     handle,
+    cusolverEigMode_t      jobz,
+    cusolverEigRange_t     range,
+    cublasFillMode_t       uplo,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    double                 vl,  /* vl/vu are double, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    double                 vu,
+    int                    il,
+    int                    iu,
+    int *                  meig,
+    const double *         W,
+    int *                  lwork);  /* presumably the work size for cusolverDnZheevdx -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsyevdx(  /* selective symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): float A, W and work */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,
+    int                lda,
+    float              vl,  /* vl/vu are float, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    float              vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    float *            W,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsyevdx(  /* selective symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): double A, W and work */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    double             vl,  /* vl/vu are double, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    double             vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    double *           W,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCheevdx(  /* selective symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): cuComplex A and work, float W */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    float              vl,  /* vl/vu are float, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    float              vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    float *            W,
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZheevdx(  /* selective symmetric eigenvalue group (A*x = lambda*x, divide-and-conquer): cuDoubleComplex A and work, double W */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    double             vl,  /* vl/vu are double, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    double             vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    double *           W,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info);
+
+  /* selective generalized symmetric eigenvalue solver, A*x = lambda*B*x, by
+   * divide-and-conquer  */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx_bufferSize(  /* selective generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): const float A, B and W */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      B,
+    int                ldb,
+    float              vl,  /* vl/vu are float, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    float              vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    const float *      W,
+    int *              lwork);  /* presumably the work size for cusolverDnSsygvdx -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx_bufferSize(  /* selective generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): const double A, B and W */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     B,
+    int                ldb,
+    double             vl,  /* vl/vu are double, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    double             vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    const double *     W,
+    int *              lwork);  /* presumably the work size for cusolverDnDsygvdx -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx_bufferSize(  /* selective generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): const cuComplex A and B, const float W */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const cuComplex *  B,
+    int                ldb,
+    float              vl,  /* vl/vu are float, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    float              vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    const float *      W,
+    int *              lwork);  /* presumably the work size for cusolverDnChegvdx -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx_bufferSize(  /* selective generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): const cuDoubleComplex A and B, const double W */
+    cusolverDnHandle_t     handle,
+    cusolverEigType_t      itype,
+    cusolverEigMode_t      jobz,
+    cusolverEigRange_t     range,
+    cublasFillMode_t       uplo,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const cuDoubleComplex *B,
+    int                    ldb,
+    double                 vl,  /* vl/vu are double, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    double                 vu,
+    int                    il,
+    int                    iu,
+    int *                  meig,
+    const double *         W,
+    int *                  lwork);  /* presumably the work size for cusolverDnZhegvdx -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsygvdx(  /* selective generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): float A, B, W and work */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            B,
+    int                ldb,
+    float              vl,  /* vl/vu are float, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    float              vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    float *            W,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsygvdx(  /* selective generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): double A, B, W and work */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           B,
+    int                ldb,
+    double             vl,  /* vl/vu are double, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    double             vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    double *           W,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnChegvdx(  /* selective generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): cuComplex A, B and work, float W */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    cuComplex *        B,
+    int                ldb,
+    float              vl,  /* vl/vu are float, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    float              vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    float *            W,
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZhegvdx(  /* selective generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): cuDoubleComplex A, B and work, double W */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    cuDoubleComplex *  B,
+    int                ldb,
+    double             vl,  /* vl/vu are double, il/iu are int; presumably value and index bounds selected by range -- TODO confirm */
+    double             vu,
+    int                il,
+    int                iu,
+    int *              meig,
+    double *           W,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info);
+
+  /* generalized symmetric eigenvalue solver, A*x = lambda*B*x, by
+   * divide-and-conquer  */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd_bufferSize(  /* generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): const float A, B and W */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      B,
+    int                ldb,
+    const float *      W,
+    int *              lwork);  /* only writable argument; presumably the work size for cusolverDnSsygvd -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd_bufferSize(  /* generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): const double A, B and W */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     B,
+    int                ldb,
+    const double *     W,
+    int *              lwork);  /* only writable argument; presumably the work size for cusolverDnDsygvd -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnChegvd_bufferSize(  /* generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): const cuComplex A and B, const float W */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const cuComplex *  B,
+    int                ldb,
+    const float *      W,
+    int *              lwork);  /* only writable argument; presumably the work size for cusolverDnChegvd -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZhegvd_bufferSize(  /* generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): const cuDoubleComplex A and B, const double W */
+    cusolverDnHandle_t     handle,
+    cusolverEigType_t      itype,
+    cusolverEigMode_t      jobz,
+    cublasFillMode_t       uplo,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const cuDoubleComplex *B,
+    int                    ldb,
+    const double *         W,
+    int *                  lwork);  /* only writable argument; presumably the work size for cusolverDnZhegvd -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsygvd(  /* generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): float A, B, W and work */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,  /* A and B are non-const here, unlike in cusolverDnSsygvd_bufferSize */
+    int                lda,
+    float *            B,
+    int                ldb,
+    float *            W,
+    float *            work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsygvd(  /* generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): double A, B, W and work */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,  /* A and B are non-const here, unlike in cusolverDnDsygvd_bufferSize */
+    int                lda,
+    double *           B,
+    int                ldb,
+    double *           W,
+    double *           work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnChegvd(  /* generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): cuComplex A, B and work */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    cuComplex *        B,
+    int                ldb,
+    float *            W,  /* W is float, not cuComplex */
+    cuComplex *        work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZhegvd(  /* generalized symmetric eigenvalue group (A*x = lambda*B*x, divide-and-conquer): cuDoubleComplex A, B and work */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    cuDoubleComplex *  B,
+    int                ldb,
+    double *           W,  /* W is double, not cuDoubleComplex */
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCreateSyevjInfo(syevjInfo_t *info);  /* takes syevjInfo_t by pointer; presumably creates the object -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDestroySyevjInfo(syevjInfo_t info);  /* takes syevjInfo_t by value; presumably releases it -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI  /* setter on a syevjInfo_t taking a double tolerance; its exact meaning is not visible here */
+    cusolverDnXsyevjSetTolerance(syevjInfo_t info, double tolerance);
+
+  cusolverStatus_t CUSOLVERAPI  /* setter on a syevjInfo_t taking an int max_sweeps; valid range is not visible here */
+    cusolverDnXsyevjSetMaxSweeps(syevjInfo_t info, int max_sweeps);
+
+  cusolverStatus_t CUSOLVERAPI  /* setter on a syevjInfo_t taking an int sort_eig; presumably a boolean-style flag -- TODO confirm */
+    cusolverDnXsyevjSetSortEig(syevjInfo_t info, int sort_eig);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetResidual(  /* getter that needs both the solver handle and a syevjInfo_t */
+    cusolverDnHandle_t handle,
+    syevjInfo_t        info,
+    double *           residual);  /* only pointer argument; presumably receives the result -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXsyevjGetSweeps(  /* getter that needs both the solver handle and a syevjInfo_t */
+    cusolverDnHandle_t handle,
+    syevjInfo_t        info,
+    int *              executed_sweeps);  /* only pointer argument; presumably receives the result -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched_bufferSize(  /* float variant: const float A and W, plus syevjInfo_t params and int batchSize */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      W,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnSsyevjBatched -- TODO confirm */
+    syevjInfo_t        params,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched_bufferSize(  /* double variant: const double A and W, plus syevjInfo_t params and int batchSize */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     W,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnDsyevjBatched -- TODO confirm */
+    syevjInfo_t        params,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched_bufferSize(  /* cuComplex variant: const cuComplex A, const float W, plus syevjInfo_t params and int batchSize */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const float *      W,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnCheevjBatched -- TODO confirm */
+    syevjInfo_t        params,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched_bufferSize(  /* cuDoubleComplex variant: const cuDoubleComplex A, const double W, plus syevjInfo_t params and int batchSize */
+    cusolverDnHandle_t     handle,
+    cusolverEigMode_t      jobz,
+    cublasFillMode_t       uplo,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const double *         W,
+    int *                  lwork,  /* only writable argument; presumably the work size for cusolverDnZheevjBatched -- TODO confirm */
+    syevjInfo_t            params,
+    int                    batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsyevjBatched(  /* float variant: float A, W and work; NOTE(review): "syevj" presumably denotes a Jacobi eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            W,
+    float *            work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params,
+    int                batchSize);  /* NOTE(review): how A, W and info are laid out across the batch is not visible here */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsyevjBatched(  /* double variant: double A, W and work; NOTE(review): "syevj" presumably denotes a Jacobi eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           W,
+    double *           work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params,
+    int                batchSize);  /* NOTE(review): how A, W and info are laid out across the batch is not visible here */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCheevjBatched(  /* cuComplex variant: cuComplex A and work, float W; NOTE(review): "heevj" presumably denotes a Jacobi eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    float *            W,
+    cuComplex *        work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params,
+    int                batchSize);  /* NOTE(review): how A, W and info are laid out across the batch is not visible here */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZheevjBatched(  /* cuDoubleComplex variant: cuDoubleComplex A and work, double W; NOTE(review): "heevj" presumably denotes a Jacobi eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    double *           W,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params,
+    int                batchSize);  /* NOTE(review): how A, W and info are laid out across the batch is not visible here */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj_bufferSize(  /* float variant: const float A and W, plus syevjInfo_t params (no batchSize) */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      W,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnSsyevj -- TODO confirm */
+    syevjInfo_t        params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj_bufferSize(  /* double variant: const double A and W, plus syevjInfo_t params (no batchSize) */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     W,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnDsyevj -- TODO confirm */
+    syevjInfo_t        params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCheevj_bufferSize(  /* cuComplex variant: const cuComplex A, const float W, plus syevjInfo_t params (no batchSize) */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const float *      W,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnCheevj -- TODO confirm */
+    syevjInfo_t        params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZheevj_bufferSize(  /* cuDoubleComplex variant: const cuDoubleComplex A, const double W, plus syevjInfo_t params (no batchSize) */
+    cusolverDnHandle_t     handle,
+    cusolverEigMode_t      jobz,
+    cublasFillMode_t       uplo,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const double *         W,
+    int *                  lwork,  /* only writable argument; presumably the work size for cusolverDnZheevj -- TODO confirm */
+    syevjInfo_t            params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsyevj(  /* float variant: float A, W and work; NOTE(review): "syevj" presumably denotes a Jacobi eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            W,
+    float *            work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params);  /* same syevjInfo_t type the cusolverDnXsyevj* setters and getters take */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsyevj(  /* double variant: double A, W and work; NOTE(review): "syevj" presumably denotes a Jacobi eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           W,
+    double *           work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params);  /* same syevjInfo_t type the cusolverDnXsyevj* setters and getters take */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCheevj(  /* cuComplex variant: cuComplex A and work, float W; NOTE(review): "heevj" presumably denotes a Jacobi eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    float *            W,
+    cuComplex *        work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params);  /* same syevjInfo_t type the cusolverDnXsyevj* setters and getters take */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZheevj(  /* cuDoubleComplex variant: cuDoubleComplex A and work, double W; NOTE(review): "heevj" presumably denotes a Jacobi eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    double *           W,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params);  /* same syevjInfo_t type the cusolverDnXsyevj* setters and getters take */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj_bufferSize(  /* float variant: const float A, B and W, plus cusolverEigType_t itype and syevjInfo_t params */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      B,
+    int                ldb,
+    const float *      W,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnSsygvj -- TODO confirm */
+    syevjInfo_t        params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj_bufferSize(  /* double variant: const double A, B and W, plus cusolverEigType_t itype and syevjInfo_t params */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     B,
+    int                ldb,
+    const double *     W,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnDsygvj -- TODO confirm */
+    syevjInfo_t        params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnChegvj_bufferSize(  /* cuComplex variant: const cuComplex A and B, const float W, plus cusolverEigType_t itype and syevjInfo_t params */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const cuComplex *  B,
+    int                ldb,
+    const float *      W,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnChegvj -- TODO confirm */
+    syevjInfo_t        params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj_bufferSize(  /* cuDoubleComplex variant: const cuDoubleComplex A and B, const double W, plus cusolverEigType_t itype and syevjInfo_t params */
+    cusolverDnHandle_t     handle,
+    cusolverEigType_t      itype,
+    cusolverEigMode_t      jobz,
+    cublasFillMode_t       uplo,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const cuDoubleComplex *B,
+    int                    ldb,
+    const double *         W,
+    int *                  lwork,  /* only writable argument; presumably the work size for cusolverDnZhegvj -- TODO confirm */
+    syevjInfo_t            params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSsygvj(  /* float variant: float A, B, W and work; NOTE(review): "sygvj" presumably denotes a Jacobi generalized eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            B,
+    int                ldb,
+    float *            W,
+    float *            work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params);  /* takes syevjInfo_t: no separate sygvj info type appears in this chunk */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDsygvj(  /* double variant: double A, B, W and work; NOTE(review): "sygvj" presumably denotes a Jacobi generalized eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           B,
+    int                ldb,
+    double *           W,
+    double *           work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params);  /* takes syevjInfo_t: no separate sygvj info type appears in this chunk */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnChegvj(  /* cuComplex variant: cuComplex A, B and work, float W; NOTE(review): "hegvj" presumably denotes a Jacobi generalized eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    cuComplex *        B,
+    int                ldb,
+    float *            W,
+    cuComplex *        work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params);  /* takes syevjInfo_t: no separate hegvj info type appears in this chunk */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZhegvj(  /* cuDoubleComplex variant: cuDoubleComplex A, B and work, double W; NOTE(review): "hegvj" presumably denotes a Jacobi generalized eigen-solver -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigType_t  itype,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    cuDoubleComplex *  B,
+    int                ldb,
+    double *           W,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info,
+    syevjInfo_t        params);  /* takes syevjInfo_t: no separate hegvj info type appears in this chunk */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCreateGesvdjInfo(gesvdjInfo_t *info);  /* takes gesvdjInfo_t by pointer; presumably creates the object -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDestroyGesvdjInfo(gesvdjInfo_t info);  /* takes gesvdjInfo_t by value; presumably releases it -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI  /* setter on a gesvdjInfo_t taking a double tolerance; its exact meaning is not visible here */
+    cusolverDnXgesvdjSetTolerance(gesvdjInfo_t info, double tolerance);
+
+  cusolverStatus_t CUSOLVERAPI  /* setter on a gesvdjInfo_t taking an int max_sweeps; valid range is not visible here */
+    cusolverDnXgesvdjSetMaxSweeps(gesvdjInfo_t info, int max_sweeps);
+
+  cusolverStatus_t CUSOLVERAPI  /* NOTE: the function name says SortEig while its int parameter is named sort_svd */
+    cusolverDnXgesvdjSetSortEig(gesvdjInfo_t info, int sort_svd);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetResidual(  /* getter that needs both the solver handle and a gesvdjInfo_t */
+    cusolverDnHandle_t handle,
+    gesvdjInfo_t       info,
+    double *           residual);  /* only pointer argument; presumably receives the result -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdjGetSweeps(  /* getter that needs both the solver handle and a gesvdjInfo_t */
+    cusolverDnHandle_t handle,
+    gesvdjInfo_t       info,
+    int *              executed_sweeps);  /* only pointer argument; presumably receives the result -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched_bufferSize(  /* float variant: const float A, S, U and V, plus gesvdjInfo_t params and int batchSize */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                m,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      S,
+    const float *      U,
+    int                ldu,
+    const float *      V,
+    int                ldv,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnSgesvdjBatched -- TODO confirm */
+    gesvdjInfo_t       params,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched_bufferSize(  /* double variant: const double A, S, U and V, plus gesvdjInfo_t params and int batchSize */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                m,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     S,
+    const double *     U,
+    int                ldu,
+    const double *     V,
+    int                ldv,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnDgesvdjBatched -- TODO confirm */
+    gesvdjInfo_t       params,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched_bufferSize(  /* cuComplex variant: const cuComplex A, U and V, const float S, plus gesvdjInfo_t params and int batchSize */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                m,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const float *      S,
+    const cuComplex *  U,
+    int                ldu,
+    const cuComplex *  V,
+    int                ldv,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnCgesvdjBatched -- TODO confirm */
+    gesvdjInfo_t       params,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched_bufferSize(  /* cuDoubleComplex variant: const cuDoubleComplex A, U and V, const double S, plus gesvdjInfo_t params and int batchSize */
+    cusolverDnHandle_t     handle,
+    cusolverEigMode_t      jobz,
+    int                    m,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const double *         S,
+    const cuDoubleComplex *U,
+    int                    ldu,
+    const cuDoubleComplex *V,
+    int                    ldv,
+    int *                  lwork,  /* only writable argument; presumably the work size for cusolverDnZgesvdjBatched -- TODO confirm */
+    gesvdjInfo_t           params,
+    int                    batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdjBatched(  /* float variant: float A, S, U, V and work; NOTE(review): "gesvdj" presumably denotes a Jacobi SVD -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,  /* uses cusolverEigMode_t, unlike the signed char jobu/jobvt of cusolverDnSgesvd */
+    int                m,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            S,
+    float *            U,
+    int                ldu,
+    float *            V,
+    int                ldv,
+    float *            work,
+    int                lwork,
+    int *              info,
+    gesvdjInfo_t       params,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdjBatched(  /* double variant: double A, S, U, V and work; NOTE(review): "gesvdj" presumably denotes a Jacobi SVD -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,  /* uses cusolverEigMode_t, unlike the signed char jobu/jobvt of cusolverDnDgesvd */
+    int                m,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           S,
+    double *           U,
+    int                ldu,
+    double *           V,
+    int                ldv,
+    double *           work,
+    int                lwork,
+    int *              info,
+    gesvdjInfo_t       params,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdjBatched(  /* cuComplex variant: cuComplex A, U, V and work, float S; NOTE(review): "gesvdj" presumably denotes a Jacobi SVD -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,  /* uses cusolverEigMode_t, unlike the signed char jobu/jobvt of cusolverDnCgesvd */
+    int                m,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    float *            S,
+    cuComplex *        U,
+    int                ldu,
+    cuComplex *        V,
+    int                ldv,
+    cuComplex *        work,
+    int                lwork,
+    int *              info,
+    gesvdjInfo_t       params,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdjBatched(  /* cuDoubleComplex variant: cuDoubleComplex A, U, V and work, double S; NOTE(review): "gesvdj" presumably denotes a Jacobi SVD -- confirm in the cuSOLVER docs */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,  /* uses cusolverEigMode_t, unlike the signed char jobu/jobvt of cusolverDnZgesvd */
+    int                m,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    double *           S,
+    cuDoubleComplex *  U,
+    int                ldu,
+    cuDoubleComplex *  V,
+    int                ldv,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info,
+    gesvdjInfo_t       params,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj_bufferSize(  /* float variant: const float A, S, U and V, plus gesvdjInfo_t params */
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                econ,  /* extra int flag absent from cusolverDnSgesvdjBatched_bufferSize; presumably selects an economy-size result -- TODO confirm */
+    int                m,
+    int                n,
+    const float *      A,
+    int                lda,
+    const float *      S,
+    const float *      U,
+    int                ldu,
+    const float *      V,
+    int                ldv,
+    int *              lwork,  /* only writable argument; presumably the work size for cusolverDnSgesvdj -- TODO confirm */
+    gesvdjInfo_t       params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                econ,
+    int                m,
+    int                n,
+    const double *     A,
+    int                lda,
+    const double *     S,
+    const double *     U,
+    int                ldu,
+    const double *     V,
+    int                ldv,
+    int *              lwork,
+    gesvdjInfo_t       params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                econ,
+    int                m,
+    int                n,
+    const cuComplex *  A,
+    int                lda,
+    const float *      S,
+    const cuComplex *  U,
+    int                ldu,
+    const cuComplex *  V,
+    int                ldv,
+    int *              lwork,
+    gesvdjInfo_t       params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj_bufferSize(
+    cusolverDnHandle_t     handle,
+    cusolverEigMode_t      jobz,
+    int                    econ,
+    int                    m,
+    int                    n,
+    const cuDoubleComplex *A,
+    int                    lda,
+    const double *         S,
+    const cuDoubleComplex *U,
+    int                    ldu,
+    const cuDoubleComplex *V,
+    int                    ldv,
+    int *                  lwork,
+    gesvdjInfo_t           params);
+  /* gesvdj, S/D/C/Z variants: like gesvdj_bufferSize but with mutable arrays, plus work, by-value lwork and info. */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdj(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                econ,
+    int                m,
+    int                n,
+    float *            A,
+    int                lda,
+    float *            S,
+    float *            U,
+    int                ldu,
+    float *            V,
+    int                ldv,
+    float *            work,
+    int                lwork,
+    int *              info,
+    gesvdjInfo_t       params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdj(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                econ,
+    int                m,
+    int                n,
+    double *           A,
+    int                lda,
+    double *           S,
+    double *           U,
+    int                ldu,
+    double *           V,
+    int                ldv,
+    double *           work,
+    int                lwork,
+    int *              info,
+    gesvdjInfo_t       params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdj(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                econ,
+    int                m,
+    int                n,
+    cuComplex *        A,
+    int                lda,
+    float *            S, /* float, not cuComplex, in the complex variant */
+    cuComplex *        U,
+    int                ldu,
+    cuComplex *        V,
+    int                ldv,
+    cuComplex *        work,
+    int                lwork,
+    int *              info,
+    gesvdjInfo_t       params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdj(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                econ,
+    int                m,
+    int                n,
+    cuDoubleComplex *  A,
+    int                lda,
+    double *           S, /* double, not cuDoubleComplex, in the complex variant */
+    cuDoubleComplex *  U,
+    int                ldu,
+    cuDoubleComplex *  V,
+    int                ldv,
+    cuDoubleComplex *  work,
+    int                lwork,
+    int *              info,
+    gesvdjInfo_t       params);
+
+  /* batched approximate SVD: gesvdaStridedBatched_bufferSize, S/D/C/Z variants (all array arguments const) */
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                rank,
+    int                m,
+    int                n,
+    const float *      d_A,
+    int                lda,
+    long long int      strideA, /* strides are long long int; dimensions and batchSize are int */
+    const float *      d_S,
+    long long int      strideS,
+    const float *      d_U,
+    int                ldu,
+    long long int      strideU,
+    const float *      d_V,
+    int                ldv,
+    long long int      strideV,
+    int *              lwork,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                rank,
+    int                m,
+    int                n,
+    const double *     d_A,
+    int                lda,
+    long long int      strideA,
+    const double *     d_S,
+    long long int      strideS,
+    const double *     d_U,
+    int                ldu,
+    long long int      strideU,
+    const double *     d_V,
+    int                ldv,
+    long long int      strideV,
+    int *              lwork,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                rank,
+    int                m,
+    int                n,
+    const cuComplex *  d_A,
+    int                lda,
+    long long int      strideA,
+    const float *      d_S, /* float, not cuComplex, in the complex variant */
+    long long int      strideS,
+    const cuComplex *  d_U,
+    int                ldu,
+    long long int      strideU,
+    const cuComplex *  d_V,
+    int                ldv,
+    long long int      strideV,
+    int *              lwork,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched_bufferSize(
+    cusolverDnHandle_t     handle,
+    cusolverEigMode_t      jobz,
+    int                    rank,
+    int                    m,
+    int                    n,
+    const cuDoubleComplex *d_A,
+    int                    lda,
+    long long int          strideA,
+    const double *         d_S, /* double, not cuDoubleComplex, in the complex variant */
+    long long int          strideS,
+    const cuDoubleComplex *d_U,
+    int                    ldu,
+    long long int          strideU,
+    const cuDoubleComplex *d_V,
+    int                    ldv,
+    long long int          strideV,
+    int *                  lwork,
+    int                    batchSize);
+  /* gesvdaStridedBatched, S/D/C/Z variants: d_A stays const; h_R_nrmF is double * in every precision. */
+  cusolverStatus_t CUSOLVERAPI cusolverDnSgesvdaStridedBatched(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                rank,
+    int                m,
+    int                n,
+    const float *      d_A,
+    int                lda,
+    long long int      strideA,
+    float *            d_S,
+    long long int      strideS,
+    float *            d_U,
+    int                ldu,
+    long long int      strideU,
+    float *            d_V,
+    int                ldv,
+    long long int      strideV,
+    float *            d_work,
+    int                lwork,
+    int *              d_info,
+    double *           h_R_nrmF, /* double even here; d_/h_ presumably mean device/host - confirm */
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnDgesvdaStridedBatched(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                rank,
+    int                m,
+    int                n,
+    const double *     d_A,
+    int                lda,
+    long long int      strideA,
+    double *           d_S,
+    long long int      strideS,
+    double *           d_U,
+    int                ldu,
+    long long int      strideU,
+    double *           d_V,
+    int                ldv,
+    long long int      strideV,
+    double *           d_work,
+    int                lwork,
+    int *              d_info,
+    double *           h_R_nrmF,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnCgesvdaStridedBatched(
+    cusolverDnHandle_t handle,
+    cusolverEigMode_t  jobz,
+    int                rank,
+    int                m,
+    int                n,
+    const cuComplex *  d_A,
+    int                lda,
+    long long int      strideA,
+    float *            d_S, /* float, not cuComplex, in the complex variant */
+    long long int      strideS,
+    cuComplex *        d_U,
+    int                ldu,
+    long long int      strideU,
+    cuComplex *        d_V,
+    int                ldv,
+    long long int      strideV,
+    cuComplex *        d_work,
+    int                lwork,
+    int *              d_info,
+    double *           h_R_nrmF,
+    int                batchSize);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnZgesvdaStridedBatched(
+    cusolverDnHandle_t     handle,
+    cusolverEigMode_t      jobz,
+    int                    rank,
+    int                    m,
+    int                    n,
+    const cuDoubleComplex *d_A,
+    int                    lda,
+    long long int          strideA,
+    double *               d_S, /* double, not cuDoubleComplex, in the complex variant */
+    long long int          strideS,
+    cuDoubleComplex *      d_U,
+    int                    ldu,
+    long long int          strideU,
+    cuDoubleComplex *      d_V,
+    int                    ldv,
+    long long int          strideV,
+    cuDoubleComplex *      d_work,
+    int                    lwork,
+    int *                  d_info,
+    double *               h_R_nrmF,
+    int                    batchSize);
+  /* cusolverDnParams_t helpers: Create takes a pointer to it; Destroy and SetAdvOptions take it by value. */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnCreateParams(cusolverDnParams_t *params);
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnDestroyParams(cusolverDnParams_t params);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnSetAdvOptions(
+    cusolverDnParams_t   params,
+    cusolverDnFunction_t function,
+    cusolverAlgMode_t    algo);
+
+  /* 64-bit API for POTRF (deprecated: see the cusolverDnXpotrf* names passed to CUSOLVER_DEPRECATED) */
+  CUSOLVER_DEPRECATED(cusolverDnXpotrf_bufferSize)
+  cusolverStatus_t CUSOLVERAPI cusolverDnPotrf_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytes); /* one size here; the X variant has separate device and host sizes */
+
+  CUSOLVER_DEPRECATED(cusolverDnXpotrf)
+  cusolverStatus_t CUSOLVERAPI cusolverDnPotrf(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    cudaDataType       computeType,
+    void *             pBuffer,
+    size_t             workspaceInBytes,
+    int *              info);
+
+  /* 64-bit API for POTRS (deprecated; CUSOLVER_DEPRECATED names cusolverDnXpotrs, which has the same parameter list) */
+  CUSOLVER_DEPRECATED(cusolverDnXpotrs)
+  cusolverStatus_t CUSOLVERAPI cusolverDnPotrs(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    int64_t            nrhs,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       dataTypeB,
+    void *             B,
+    int64_t            ldb,
+    int *              info);
+
+  /* 64-bit API for GEQRF (deprecated: see the cusolverDnXgeqrf* names passed to CUSOLVER_DEPRECATED) */
+  CUSOLVER_DEPRECATED(cusolverDnXgeqrf_bufferSize)
+  cusolverStatus_t CUSOLVERAPI cusolverDnGeqrf_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       dataTypeTau,
+    const void *       tau,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytes); /* one size here; the X variant has separate device and host sizes */
+
+  CUSOLVER_DEPRECATED(cusolverDnXgeqrf)
+  cusolverStatus_t CUSOLVERAPI cusolverDnGeqrf(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    cudaDataType       dataTypeTau,
+    void *             tau,
+    cudaDataType       computeType,
+    void *             pBuffer,
+    size_t             workspaceInBytes,
+    int *              info);
+
+  /* 64-bit API for GETRF (deprecated: see the cusolverDnXgetrf* names passed to CUSOLVER_DEPRECATED) */
+  CUSOLVER_DEPRECATED(cusolverDnXgetrf_bufferSize)
+  cusolverStatus_t CUSOLVERAPI cusolverDnGetrf_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytes); /* one size here; the X variant has separate device and host sizes */
+
+  CUSOLVER_DEPRECATED(cusolverDnXgetrf)
+  cusolverStatus_t CUSOLVERAPI cusolverDnGetrf(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    int64_t *          ipiv,
+    cudaDataType       computeType,
+    void *             pBuffer,
+    size_t             workspaceInBytes,
+    int *              info);
+
+  /* 64-bit API for GETRS (deprecated; CUSOLVER_DEPRECATED names cusolverDnXgetrs, which has the same parameter list) */
+  CUSOLVER_DEPRECATED(cusolverDnXgetrs)
+  cusolverStatus_t CUSOLVERAPI cusolverDnGetrs(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cublasOperation_t  trans,
+    int64_t            n,
+    int64_t            nrhs,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    const int64_t *    ipiv,
+    cudaDataType       dataTypeB,
+    void *             B,
+    int64_t            ldb,
+    int *              info);
+
+  /* 64-bit API for SYEVD (deprecated: see the cusolverDnXsyevd* names passed to CUSOLVER_DEPRECATED) */
+  CUSOLVER_DEPRECATED(cusolverDnXsyevd_bufferSize)
+  cusolverStatus_t CUSOLVERAPI cusolverDnSyevd_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       dataTypeW,
+    const void *       W,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytes); /* one size here; the X variant has separate device and host sizes */
+
+  CUSOLVER_DEPRECATED(cusolverDnXsyevd)
+  cusolverStatus_t CUSOLVERAPI cusolverDnSyevd(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    cudaDataType       dataTypeW,
+    void *             W,
+    cudaDataType       computeType,
+    void *             pBuffer,
+    size_t             workspaceInBytes,
+    int *              info);
+
+  /* 64-bit API for SYEVDX (deprecated: see the cusolverDnXsyevdx* names passed to CUSOLVER_DEPRECATED) */
+  CUSOLVER_DEPRECATED(cusolverDnXsyevdx_bufferSize)
+  cusolverStatus_t CUSOLVERAPI cusolverDnSyevdx_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    void *             vl,
+    void *             vu,
+    int64_t            il,
+    int64_t            iu,
+    int64_t *          h_meig, /* same position and type as meig64 in cusolverDnSyevdx */
+    cudaDataType       dataTypeW,
+    const void *       W,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytes);
+
+  CUSOLVER_DEPRECATED(cusolverDnXsyevdx)
+  cusolverStatus_t CUSOLVERAPI cusolverDnSyevdx(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    void *             vl,
+    void *             vu,
+    int64_t            il,
+    int64_t            iu,
+    int64_t *          meig64,
+    cudaDataType       dataTypeW,
+    void *             W,
+    cudaDataType       computeType,
+    void *             pBuffer,
+    size_t             workspaceInBytes,
+    int *              info);
+
+  /* 64-bit API for GESVD (deprecated: see the cusolverDnXgesvd* names passed to CUSOLVER_DEPRECATED) */
+  CUSOLVER_DEPRECATED(cusolverDnXgesvd_bufferSize)
+  cusolverStatus_t CUSOLVERAPI cusolverDnGesvd_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    signed char        jobu,
+    signed char        jobvt,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       dataTypeS,
+    const void *       S,
+    cudaDataType       dataTypeU,
+    const void *       U,
+    int64_t            ldu,
+    cudaDataType       dataTypeVT,
+    const void *       VT,
+    int64_t            ldvt,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytes); /* one size here; the X variant has separate device and host sizes */
+
+  CUSOLVER_DEPRECATED(cusolverDnXgesvd)
+  cusolverStatus_t CUSOLVERAPI cusolverDnGesvd(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    signed char        jobu,
+    signed char        jobvt,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    cudaDataType       dataTypeS,
+    void *             S,
+    cudaDataType       dataTypeU,
+    void *             U,
+    int64_t            ldu,
+    cudaDataType       dataTypeVT,
+    void *             VT,
+    int64_t            ldvt,
+    cudaDataType       computeType,
+    void *             pBuffer,
+    size_t             workspaceInBytes,
+    int *              info);
+
+  /*
+   * new 64-bit API: routines that take workspace use separate device and host buffers/sizes
+   */
+  /* 64-bit API for POTRF */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXpotrf_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytesOnDevice,
+    size_t *           workspaceInBytesOnHost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXpotrf(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    cudaDataType       computeType,
+    void *             bufferOnDevice,
+    size_t             workspaceInBytesOnDevice,
+    void *             bufferOnHost,
+    size_t             workspaceInBytesOnHost,
+    int *              info);
+
+  /* 64-bit API for POTRS (no workspace arguments) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXpotrs(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    int64_t            nrhs,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       dataTypeB,
+    void *             B,
+    int64_t            ldb,
+    int *              info);
+
+  /* 64-bit API for GEQRF (tau carries its own cudaDataType, dataTypeTau) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgeqrf_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       dataTypeTau,
+    const void *       tau,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytesOnDevice,
+    size_t *           workspaceInBytesOnHost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgeqrf(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    cudaDataType       dataTypeTau,
+    void *             tau,
+    cudaDataType       computeType,
+    void *             bufferOnDevice,
+    size_t             workspaceInBytesOnDevice,
+    void *             bufferOnHost,
+    size_t             workspaceInBytesOnHost,
+    int *              info);
+
+  /* 64-bit API for GETRF (ipiv is int64_t * and appears only in the compute call) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgetrf_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytesOnDevice,
+    size_t *           workspaceInBytesOnHost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgetrf(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    int64_t *          ipiv,
+    cudaDataType       computeType,
+    void *             bufferOnDevice,
+    size_t             workspaceInBytesOnDevice,
+    void *             bufferOnHost,
+    size_t             workspaceInBytesOnHost,
+    int *              info);
+
+  /* 64-bit API for GETRS (A and ipiv are const; no workspace arguments) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgetrs(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cublasOperation_t  trans,
+    int64_t            n,
+    int64_t            nrhs,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    const int64_t *    ipiv,
+    cudaDataType       dataTypeB,
+    void *             B,
+    int64_t            ldb,
+    int *              info);
+
+  /* 64-bit API for SYEVD (W carries its own cudaDataType, dataTypeW) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXsyevd_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       dataTypeW,
+    const void *       W,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytesOnDevice,
+    size_t *           workspaceInBytesOnHost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXsyevd(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cusolverEigMode_t  jobz,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    cudaDataType       dataTypeW,
+    void *             W,
+    cudaDataType       computeType,
+    void *             bufferOnDevice,
+    size_t             workspaceInBytesOnDevice,
+    void *             bufferOnHost,
+    size_t             workspaceInBytesOnHost,
+    int *              info);
+
+  /* 64-bit API for SYEVDX (vl/vu are void *, il/iu are int64_t) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXsyevdx_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    void *             vl,
+    void *             vu,
+    int64_t            il,
+    int64_t            iu,
+    int64_t *          h_meig, /* same position and type as meig64 in cusolverDnXsyevdx */
+    cudaDataType       dataTypeW,
+    const void *       W,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytesOnDevice,
+    size_t *           workspaceInBytesOnHost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXsyevdx(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cusolverEigMode_t  jobz,
+    cusolverEigRange_t range,
+    cublasFillMode_t   uplo,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    void *             vl,
+    void *             vu,
+    int64_t            il,
+    int64_t            iu,
+    int64_t *          meig64,
+    cudaDataType       dataTypeW,
+    void *             W,
+    cudaDataType       computeType,
+    void *             bufferOnDevice,
+    size_t             workspaceInBytesOnDevice,
+    void *             bufferOnHost,
+    size_t             workspaceInBytesOnHost,
+    int *              info);
+
+  /* 64-bit API for GESVD (jobu/jobvt are signed char; S, U and VT each carry their own cudaDataType) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgesvd_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    signed char        jobu,
+    signed char        jobvt,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       dataTypeS,
+    const void *       S,
+    cudaDataType       dataTypeU,
+    const void *       U,
+    int64_t            ldu,
+    cudaDataType       dataTypeVT,
+    const void *       VT,
+    int64_t            ldvt,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytesOnDevice,
+    size_t *           workspaceInBytesOnHost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgesvd(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    signed char        jobu,
+    signed char        jobvt,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    cudaDataType       dataTypeS,
+    void *             S,
+    cudaDataType       dataTypeU,
+    void *             U,
+    int64_t            ldu,
+    cudaDataType       dataTypeVT,
+    void *             VT,
+    int64_t            ldvt,
+    cudaDataType       computeType,
+    void *             bufferOnDevice,
+    size_t             workspaceInBytesOnDevice,
+    void *             bufferOnHost,
+    size_t             workspaceInBytesOnHost,
+    int *              info);
+
+  /* 64-bit API for GESVDP (takes jobz/econ rather than jobu/jobvt; the compute call ends with double *h_err_sigma) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdp_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cusolverEigMode_t  jobz,
+    int                econ,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       dataTypeS,
+    const void *       S,
+    cudaDataType       dataTypeU,
+    const void *       U,
+    int64_t            ldu,
+    cudaDataType       dataTypeV,
+    const void *       V,
+    int64_t            ldv,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytesOnDevice,
+    size_t *           workspaceInBytesOnHost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdp(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    cusolverEigMode_t  jobz,
+    int                econ,
+    int64_t            m,
+    int64_t            n,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    cudaDataType       dataTypeS,
+    void *             S,
+    cudaDataType       dataTypeU,
+    void *             U,
+    int64_t            ldu,
+    cudaDataType       dataTypeV,
+    void *             V,
+    int64_t            ldv,
+    cudaDataType       computeType,
+    void *             bufferOnDevice,
+    size_t             workspaceInBytesOnDevice,
+    void *             bufferOnHost,
+    size_t             workspaceInBytesOnHost,
+    int *              d_info,
+    double *           h_err_sigma);
+  /* 64-bit API for GESVDR (adds int64_t k, p and niters after m and n) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdr_bufferSize(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    signed char        jobu,
+    signed char        jobv,
+    int64_t            m,
+    int64_t            n,
+    int64_t            k,
+    int64_t            p,
+    int64_t            niters,
+    cudaDataType       dataTypeA,
+    const void *       A,
+    int64_t            lda,
+    cudaDataType       dataTypeSrand,
+    const void *       Srand,
+    cudaDataType       dataTypeUrand,
+    const void *       Urand,
+    int64_t            ldUrand,
+    cudaDataType       dataTypeVrand,
+    const void *       Vrand,
+    int64_t            ldVrand,
+    cudaDataType       computeType,
+    size_t *           workspaceInBytesOnDevice,
+    size_t *           workspaceInBytesOnHost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXgesvdr(
+    cusolverDnHandle_t handle,
+    cusolverDnParams_t params,
+    signed char        jobu,
+    signed char        jobv,
+    int64_t            m,
+    int64_t            n,
+    int64_t            k,
+    int64_t            p,
+    int64_t            niters,
+    cudaDataType       dataTypeA,
+    void *             A,
+    int64_t            lda,
+    cudaDataType       dataTypeSrand,
+    void *             Srand,
+    cudaDataType       dataTypeUrand,
+    void *             Urand,
+    int64_t            ldUrand,
+    cudaDataType       dataTypeVrand,
+    void *             Vrand,
+    int64_t            ldVrand,
+    cudaDataType       computeType,
+    void *             bufferOnDevice,
+    size_t             workspaceInBytesOnDevice,
+    void *             bufferOnHost,
+    size_t             workspaceInBytesOnHost,
+    int *              d_info);
+  /* 64-bit API for LARFT (neither declaration has an info argument; d_T is non-const even in _bufferSize) */
+  cusolverStatus_t CUSOLVERAPI cusolverDnXlarft_bufferSize(
+    cusolverDnHandle_t     handle,
+    cusolverDnParams_t     params,
+    cusolverDirectMode_t   direct,
+    cusolverStorevMode_t   storev,
+    int64_t                N,
+    int64_t                K,
+    cudaDataType           dataTypeV,
+    const void            *d_V,
+    int64_t                ldv,
+    cudaDataType           dataTypeTau,
+    const void            *d_tau,
+    cudaDataType           dataTypeT,
+    void                  *d_T,
+    int64_t                ldt,
+    cudaDataType           computeType,
+    size_t                *workspaceInBytesOnDevice,
+    size_t                *workspaceInBytesOnHost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnXlarft(
+    cusolverDnHandle_t     handle,
+    cusolverDnParams_t     params,
+    cusolverDirectMode_t   direct,
+    cusolverStorevMode_t   storev,
+    int64_t                N,
+    int64_t                K,
+    cudaDataType           dataTypeV,
+    const void            *d_V,
+    int64_t                ldv,
+    cudaDataType           dataTypeTau,
+    const void            *d_tau,
+    cudaDataType           dataTypeT,
+    void                  *d_T,
+    int64_t                ldt,
+    cudaDataType           computeType,
+    void                  *bufferOnDevice,
+    size_t                 workspaceInBytesOnDevice,
+    void                  *bufferOnHost,
+    size_t                 workspaceInBytesOnHost);
+  /* Logger configuration; the callback type takes (int logLevel, const char *functionName, const char *message). */
+  typedef void (*cusolverDnLoggerCallback_t)(
+    int         logLevel,
+    const char *functionName,
+    const char *message);
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverDnLoggerSetCallback(cusolverDnLoggerCallback_t callback);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnLoggerSetFile(FILE *file);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnLoggerOpenFile(const char *logFile);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnLoggerSetLevel(int level);
+
+  cusolverStatus_t CUSOLVERAPI cusolverDnLoggerSetMask(int mask);
+  /* Takes no arguments; spelled (void) so C sees a full prototype instead of an unspecified parameter list. */
+  cusolverStatus_t CUSOLVERAPI cusolverDnLoggerForceDisable(void);
+
+  #if defined(__cplusplus)
+}
+  #endif /* __cplusplus */
+
+#endif /* !defined(CUDENSE_H_) */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverMg.h b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverMg.h
new file mode 100644
index 0000000000000000000000000000000000000000..7702191f7253d66cf998016f6ae9f14149fbbb0b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverMg.h
@@ -0,0 +1,318 @@
+/*
+ * Copyright 2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CUSOLVERMG_H_)
+  #define CUSOLVERMG_H_
+
+  #include <stdint.h>
+  #include "cusolverDn.h"
+
+  #if defined(__cplusplus)
+extern "C" {
+  #endif /* __cplusplus */
+
+  struct cusolverMgContext;
+  typedef struct cusolverMgContext *cusolverMgHandle_t;
+
+  /**
+   * \brief This enum decides how 1D device Ids (or process ranks) get mapped to
+   * a 2D grid.
+   */
+  typedef enum {
+
+    CUDALIBMG_GRID_MAPPING_ROW_MAJOR = 1,
+    CUDALIBMG_GRID_MAPPING_COL_MAJOR = 0
+
+  } cusolverMgGridMapping_t;
+
+  /** \brief Opaque structure of the distributed grid */
+  typedef void *cudaLibMgGrid_t;
+  /** \brief Opaque structure of the distributed matrix descriptor */
+  typedef void *cudaLibMgMatrixDesc_t;
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgCreate(cusolverMgHandle_t *handle);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgDestroy(cusolverMgHandle_t handle);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgDeviceSelect(
+    cusolverMgHandle_t handle,
+    int                nbDevices,
+    int                deviceId[]);
+
+  /**
+   * \brief Allocates resources related to the shared memory device grid.
+   * \param[out] grid the opaque data structure that holds the grid
+   * \param[in] numRowDevices number of devices in the row
+   * \param[in] numColDevices number of devices in the column
+   * \param[in] deviceId This array of size height * width stores the
+   *            device-ids of the 2D grid; each entry must correspond to a valid
+   * gpu or to -1 (denoting CPU). \param[in] mapping whether the 2D grid is in
+   * row/column major \returns the status code
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverMgCreateDeviceGrid(
+    cudaLibMgGrid_t *       grid,
+    int32_t                 numRowDevices,
+    int32_t                 numColDevices,
+    const int32_t           deviceId[],
+    cusolverMgGridMapping_t mapping);
+
+  /**
+   * \brief Releases the allocated resources related to the distributed grid.
+   * \param[in] grid the opaque data structure that holds the distributed grid
+   * \returns the status code
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverMgDestroyGrid(cudaLibMgGrid_t grid);
+
+  /**
+   * \brief Allocates resources related to the distributed matrix descriptor.
+   * \param[out] desc the opaque data structure that holds the descriptor
+   * \param[in] numRows number of total rows
+   * \param[in] numCols number of total columns
+   * \param[in] rowBlockSize row block size
+   * \param[in] colBlockSize column block size
+   * \param[in] dataType the data type of each element in cudaDataType
+   * \param[in] grid the opaque data structure of the distributed grid
+   * \returns the status code
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverMgCreateMatrixDesc(
+    cudaLibMgMatrixDesc_t *desc,
+    int64_t                numRows,
+    int64_t                numCols,
+    int64_t                rowBlockSize,
+    int64_t                colBlockSize,
+    cudaDataType           dataType,
+    const cudaLibMgGrid_t  grid);
+
+  /**
+   * \brief Releases the allocated resources related to the distributed matrix
+   * descriptor. \param[in] desc the opaque data structure that holds the
+   * descriptor \returns the status code
+   */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverMgDestroyMatrixDesc(cudaLibMgMatrixDesc_t desc);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgSyevd_bufferSize(
+    cusolverMgHandle_t    handle,
+    cusolverEigMode_t     jobz,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    void *                W,
+    cudaDataType          dataTypeW,
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgSyevd(
+    cusolverMgHandle_t    handle,
+    cusolverEigMode_t     jobz,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    void *                W,
+    cudaDataType          dataTypeW,
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgGetrf_bufferSize(
+    cusolverMgHandle_t    handle,
+    int                   M,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    int *                 array_d_IPIV[],
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgGetrf(
+    cusolverMgHandle_t    handle,
+    int                   M,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    int *                 array_d_IPIV[],
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgGetrs_bufferSize(
+    cusolverMgHandle_t    handle,
+    cublasOperation_t     TRANS,
+    int                   N,
+    int                   NRHS,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    int *                 array_d_IPIV[],
+    void *                array_d_B[],
+    int                   IB,
+    int                   JB,
+    cudaLibMgMatrixDesc_t descrB,
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgGetrs(
+    cusolverMgHandle_t    handle,
+    cublasOperation_t     TRANS,
+    int                   N,
+    int                   NRHS,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    int *                 array_d_IPIV[],
+    void *                array_d_B[],
+    int                   IB,
+    int                   JB,
+    cudaLibMgMatrixDesc_t descrB,
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotrf_bufferSize(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotrf(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 h_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotrs_bufferSize(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   n,
+    int                   nrhs,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    void *                array_d_B[],
+    int                   IB,
+    int                   JB,
+    cudaLibMgMatrixDesc_t descrB,
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotrs(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   n,
+    int                   nrhs,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    void *                array_d_B[],
+    int                   IB,
+    int                   JB,
+    cudaLibMgMatrixDesc_t descrB,
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 h_info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotri_bufferSize(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    cudaDataType          computeType,
+    int64_t *             lwork);
+
+  cusolverStatus_t CUSOLVERAPI cusolverMgPotri(
+    cusolverMgHandle_t    handle,
+    cublasFillMode_t      uplo,
+    int                   N,
+    void *                array_d_A[],
+    int                   IA,
+    int                   JA,
+    cudaLibMgMatrixDesc_t descrA,
+    cudaDataType          computeType,
+    void *                array_d_work[],
+    int64_t               lwork,
+    int *                 h_info);
+
+  #if defined(__cplusplus)
+}
+  #endif /* __cplusplus */
+
+#endif // CUSOLVERMG_H_
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverRf.h b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverRf.h
new file mode 100644
index 0000000000000000000000000000000000000000..c74e9ca6bb34a8d1214c10450c45e2599417636f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverRf.h
@@ -0,0 +1,339 @@
+/*
+ * Copyright 1993-2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CUSOLVERRF_H_)
+  #define CUSOLVERRF_H_
+
+  #include "driver_types.h"
+  #include "cuComplex.h"
+  #include "cusolver_common.h"
+
+  #if defined(__cplusplus)
+extern "C" {
+  #endif /* __cplusplus */
+
+  /* CUSOLVERRF mode */
+  typedef enum {
+    CUSOLVERRF_RESET_VALUES_FAST_MODE_OFF = 0, // default
+    CUSOLVERRF_RESET_VALUES_FAST_MODE_ON = 1
+  } cusolverRfResetValuesFastMode_t;
+
+  /* CUSOLVERRF matrix format */
+  typedef enum {
+    CUSOLVERRF_MATRIX_FORMAT_CSR = 0, // default
+    CUSOLVERRF_MATRIX_FORMAT_CSC = 1
+  } cusolverRfMatrixFormat_t;
+
+  /* CUSOLVERRF unit diagonal */
+  typedef enum {
+    CUSOLVERRF_UNIT_DIAGONAL_STORED_L = 0, // default
+    CUSOLVERRF_UNIT_DIAGONAL_STORED_U = 1,
+    CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_L = 2,
+    CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_U = 3
+  } cusolverRfUnitDiagonal_t;
+
+  /* CUSOLVERRF factorization algorithm */
+  typedef enum {
+    CUSOLVERRF_FACTORIZATION_ALG0 = 0, // default
+    CUSOLVERRF_FACTORIZATION_ALG1 = 1,
+    CUSOLVERRF_FACTORIZATION_ALG2 = 2,
+  } cusolverRfFactorization_t;
+
+  /* CUSOLVERRF triangular solve algorithm */
+  typedef enum {
+    CUSOLVERRF_TRIANGULAR_SOLVE_ALG1 = 1, // default
+    CUSOLVERRF_TRIANGULAR_SOLVE_ALG2 = 2,
+    CUSOLVERRF_TRIANGULAR_SOLVE_ALG3 = 3
+  } cusolverRfTriangularSolve_t;
+
+  /* CUSOLVERRF numeric boost report */
+  typedef enum {
+    CUSOLVERRF_NUMERIC_BOOST_NOT_USED = 0, // default
+    CUSOLVERRF_NUMERIC_BOOST_USED = 1
+  } cusolverRfNumericBoostReport_t;
+
+  /* Opaque structure holding CUSOLVERRF library common */
+  struct cusolverRfCommon;
+  typedef struct cusolverRfCommon* cusolverRfHandle_t;
+
+  /* CUSOLVERRF create (allocate memory) and destroy (free memory) in the handle
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverRfCreate(cusolverRfHandle_t* handle);
+  cusolverStatus_t CUSOLVERAPI cusolverRfDestroy(cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF set and get input format */
+  cusolverStatus_t CUSOLVERAPI cusolverRfGetMatrixFormat(
+    cusolverRfHandle_t        handle,
+    cusolverRfMatrixFormat_t* format,
+    cusolverRfUnitDiagonal_t* diag);
+
+  cusolverStatus_t CUSOLVERAPI cusolverRfSetMatrixFormat(
+    cusolverRfHandle_t       handle,
+    cusolverRfMatrixFormat_t format,
+    cusolverRfUnitDiagonal_t diag);
+
+  /* CUSOLVERRF set and get numeric properties */
+  cusolverStatus_t CUSOLVERAPI cusolverRfSetNumericProperties(
+    cusolverRfHandle_t handle,
+    double             zero,
+    double             boost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverRfGetNumericProperties(
+    cusolverRfHandle_t handle,
+    double*            zero,
+    double*            boost);
+
+  cusolverStatus_t CUSOLVERAPI cusolverRfGetNumericBoostReport(
+    cusolverRfHandle_t              handle,
+    cusolverRfNumericBoostReport_t* report);
+
+  /* CUSOLVERRF choose the factorization and triangular solve algorithms */
+  cusolverStatus_t CUSOLVERAPI cusolverRfSetAlgs(
+    cusolverRfHandle_t          handle,
+    cusolverRfFactorization_t   factAlg,
+    cusolverRfTriangularSolve_t solveAlg);
+
+  cusolverStatus_t CUSOLVERAPI cusolverRfGetAlgs(
+    cusolverRfHandle_t           handle,
+    cusolverRfFactorization_t*   factAlg,
+    cusolverRfTriangularSolve_t* solveAlg);
+
+  /* CUSOLVERRF set and get fast mode */
+  cusolverStatus_t CUSOLVERAPI cusolverRfGetResetValuesFastMode(
+    cusolverRfHandle_t               handle,
+    cusolverRfResetValuesFastMode_t* fastMode);
+
+  cusolverStatus_t CUSOLVERAPI cusolverRfSetResetValuesFastMode(
+    cusolverRfHandle_t              handle,
+    cusolverRfResetValuesFastMode_t fastMode);
+
+  /*** Non-Batched Routines ***/
+  /* CUSOLVERRF setup of internal structures from host or device memory */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfSetupHost(/* Input (in the host memory) */
+                        int     n,
+                        int     nnzA,
+                        int*    h_csrRowPtrA,
+                        int*    h_csrColIndA,
+                        double* h_csrValA,
+                        int     nnzL,
+                        int*    h_csrRowPtrL,
+                        int*    h_csrColIndL,
+                        double* h_csrValL,
+                        int     nnzU,
+                        int*    h_csrRowPtrU,
+                        int*    h_csrColIndU,
+                        double* h_csrValU,
+                        int*    h_P,
+                        int*    h_Q,
+                        /* Output */
+                        cusolverRfHandle_t handle);
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfSetupDevice(/* Input (in the device memory) */
+                          int     n,
+                          int     nnzA,
+                          int*    csrRowPtrA,
+                          int*    csrColIndA,
+                          double* csrValA,
+                          int     nnzL,
+                          int*    csrRowPtrL,
+                          int*    csrColIndL,
+                          double* csrValL,
+                          int     nnzU,
+                          int*    csrRowPtrU,
+                          int*    csrColIndU,
+                          double* csrValU,
+                          int*    P,
+                          int*    Q,
+                          /* Output */
+                          cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF update the matrix values (assuming the reordering, pivoting
+     and consequently the sparsity pattern of L and U did not change),
+     and zero out the remaining values. */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfResetValues(/* Input (in the device memory) */
+                          int     n,
+                          int     nnzA,
+                          int*    csrRowPtrA,
+                          int*    csrColIndA,
+                          double* csrValA,
+                          int*    P,
+                          int*    Q,
+                          /* Output */
+                          cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF analysis (for parallelism) */
+  cusolverStatus_t CUSOLVERAPI cusolverRfAnalyze(cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF re-factorization (for parallelism) */
+  cusolverStatus_t CUSOLVERAPI cusolverRfRefactor(cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF extraction: Get L & U packed into a single matrix M */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfAccessBundledFactorsDevice(/* Input */
+                                         cusolverRfHandle_t handle,
+                                         /* Output (in the host memory) */
+                                         int* nnzM,
+                                         /* Output (in the device memory) */
+                                         int**    Mp,
+                                         int**    Mi,
+                                         double** Mx);
+
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfExtractBundledFactorsHost(/* Input */
+                                        cusolverRfHandle_t handle,
+                                        /* Output (in the host memory) */
+                                        int*     h_nnzM,
+                                        int**    h_Mp,
+                                        int**    h_Mi,
+                                        double** h_Mx);
+
+  /* CUSOLVERRF extraction: Get L & U individually */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfExtractSplitFactorsHost(/* Input */
+                                      cusolverRfHandle_t handle,
+                                      /* Output (in the host memory) */
+                                      int*     h_nnzL,
+                                      int**    h_csrRowPtrL,
+                                      int**    h_csrColIndL,
+                                      double** h_csrValL,
+                                      int*     h_nnzU,
+                                      int**    h_csrRowPtrU,
+                                      int**    h_csrColIndU,
+                                      double** h_csrValU);
+
+  /* CUSOLVERRF (forward and backward triangular) solves */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfSolve(/* Input (in the device memory) */
+                    cusolverRfHandle_t handle,
+                    int*               P,
+                    int*               Q,
+                    int                nrhs, // only nrhs=1 is supported
+                    double*            Temp, // of size ldt*nrhs (ldt>=n)
+                    int                ldt,
+                    /* Input/Output (in the device memory) */
+                    double* XF,
+                    /* Input */
+                    int ldxf);
+
+  /*** Batched Routines ***/
+  /* CUSOLVERRF-batch setup of internal structures from host */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchSetupHost(/* Input (in the host memory)*/
+                             int     batchSize,
+                             int     n,
+                             int     nnzA,
+                             int*    h_csrRowPtrA,
+                             int*    h_csrColIndA,
+                             double* h_csrValA_array[],
+                             int     nnzL,
+                             int*    h_csrRowPtrL,
+                             int*    h_csrColIndL,
+                             double* h_csrValL,
+                             int     nnzU,
+                             int*    h_csrRowPtrU,
+                             int*    h_csrColIndU,
+                             double* h_csrValU,
+                             int*    h_P,
+                             int*    h_Q,
+                             /* Output (in the device memory) */
+                             cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF-batch update the matrix values (assuming the reordering,
+     pivoting and consequently the sparsity pattern of L and U did not change),
+     and zero out the remaining values. */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchResetValues(/* Input (in the device memory) */
+                               int     batchSize,
+                               int     n,
+                               int     nnzA,
+                               int*    csrRowPtrA,
+                               int*    csrColIndA,
+                               double* csrValA_array[],
+                               int*    P,
+                               int*    Q,
+                               /* Output */
+                               cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF-batch analysis (for parallelism) */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchAnalyze(cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF-batch re-factorization (for parallelism) */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchRefactor(cusolverRfHandle_t handle);
+
+  /* CUSOLVERRF-batch (forward and backward triangular) solves */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchSolve(/* Input (in the device memory) */
+                         cusolverRfHandle_t handle,
+                         int*               P,
+                         int*               Q,
+                         int                nrhs, // only nrhs=1 is supported
+                         double* Temp, // of size 2*batchSize*(n*nrhs)
+                         int     ldt,  // only ldt=n is supported
+                         /* Input/Output (in the device memory) */
+                         double* XF_array[],
+                         /* Input */
+                         int ldxf);
+
+  /* CUSOLVERRF-batch obtain the position of zero pivot */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverRfBatchZeroPivot(/* Input */
+                             cusolverRfHandle_t handle,
+                             /* Output (in the host memory) */
+                             int* position);
+
+  #if defined(__cplusplus)
+}
+  #endif /* __cplusplus */
+
+#endif /* CUSOLVERRF_H_ */
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverSp.h b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverSp.h
new file mode 100644
index 0000000000000000000000000000000000000000..a00a2fac14664090a116bae89fe34f97d8e41f9c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverSp.h
@@ -0,0 +1,923 @@
+/*
+ * Copyright 2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CUSOLVERSP_H_)
+  #define CUSOLVERSP_H_
+
+  #include "cusparse.h"
+  #include "cublas_v2.h"
+  #include "cusolver_common.h"
+
+  #if defined(__cplusplus)
+extern "C" {
+  #endif /* __cplusplus */
+
+  struct cusolverSpContext;
+  typedef struct cusolverSpContext *cusolverSpHandle_t;
+
+  struct csrqrInfo;
+  typedef struct csrqrInfo *csrqrInfo_t;
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCreate(cusolverSpHandle_t *handle);
+  cusolverStatus_t CUSOLVERAPI cusolverSpDestroy(cusolverSpHandle_t handle);
+  cusolverStatus_t CUSOLVERAPI
+    cusolverSpSetStream(cusolverSpHandle_t handle, cudaStream_t streamId);
+  cusolverStatus_t CUSOLVERAPI
+    cusolverSpGetStream(cusolverSpHandle_t handle, cudaStream_t *streamId);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrissymHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrEndPtrA,
+    const int *              csrColIndA,
+    int *                    issym);
+
+  /* -------- CPU linear solver by LU factorization
+   *       solve A*x = b, A can be singular
+   * [ls] stands for linear solve
+   * [v] stands for vector
+   * [lu] stands for LU factorization
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvluHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const float *            b,
+    float                    tol,
+    int                      reorder,
+    float *                  x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvluHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const double *           b,
+    double                   tol,
+    int                      reorder,
+    double *                 x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvluHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuComplex *        b,
+    float                    tol,
+    int                      reorder,
+    cuComplex *              x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvluHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int                      reorder,
+    cuDoubleComplex *        x,
+    int *                    singularity);
+
+  /* -------- GPU linear solver by QR factorization
+   *       solve A*x = b, A can be singular
+   * [ls] stands for linear solve
+   * [v] stands for vector
+   * [qr] stands for QR factorization
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqr(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const float *            b,
+    float                    tol,
+    int                      reorder,
+    float *                  x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqr(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const double *           b,
+    double                   tol,
+    int                      reorder,
+    double *                 x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqr(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuComplex *        b,
+    float                    tol,
+    int                      reorder,
+    cuComplex *              x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqr(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int                      reorder,
+    cuDoubleComplex *        x,
+    int *                    singularity);
+
+  /* -------- CPU linear solver by QR factorization
+   *       solve A*x = b, A can be singular
+   * [ls] stands for linear solve
+   * [v] stands for vector
+   * [qr] stands for QR factorization
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const float *            b,
+    float                    tol,
+    int                      reorder,
+    float *                  x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const double *           b,
+    double                   tol,
+    int                      reorder,
+    double *                 x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuComplex *        b,
+    float                    tol,
+    int                      reorder,
+    cuComplex *              x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int                      reorder,
+    cuDoubleComplex *        x,
+    int *                    singularity);
+
+  /* -------- CPU linear solver by Cholesky factorization
+   *       solve A*x = b, A can be singular
+   * [ls] stands for linear solve
+   * [v] stands for vector
+   * [chol] stands for Cholesky factorization
+   *
+   * Only works for a symmetric positive definite matrix.
+   * The upper part of A is ignored.
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvcholHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const float *            b,
+    float                    tol,
+    int                      reorder,
+    float *                  x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvcholHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const double *           b,
+    double                   tol,
+    int                      reorder,
+    double *                 x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvcholHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuComplex *        b,
+    float                    tol,
+    int                      reorder,
+    cuComplex *              x,
+    int *                    singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvcholHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int                      reorder,
+    cuDoubleComplex *        x,
+    int *                    singularity);
+
+  /* -------- GPU linear solver by Cholesky factorization
+   *       solve A*x = b, A can be singular
+   * [ls] stands for linear solve
+   * [v] stands for vector
+   * [chol] stands for Cholesky factorization
+   *
+   * Only works for a symmetric positive definite matrix.
+   * The upper part of A is ignored.
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsvchol(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const float *            b,
+    float                    tol,
+    int                      reorder,
+    // output
+    float *x,
+    int *  singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsvchol(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const double *           b,
+    double                   tol,
+    int                      reorder,
+    // output
+    double *x,
+    int *   singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsvchol(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuComplex *        b,
+    float                    tol,
+    int                      reorder,
+    // output
+    cuComplex *x,
+    int *      singularity);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsvchol(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int                      reorder,
+    // output
+    cuDoubleComplex *x,
+    int *            singularity);
+
+  /* ----------- CPU least squares solver by QR factorization
+   *       solve min|b - A*x|
+   * [lsq] stands for least squares
+   * [v] stands for vector
+   * [qr] stands for QR factorization
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrlsqvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const float *            b,
+    float                    tol,
+    int *                    rankA,
+    float *                  x,
+    int *                    p,
+    float *                  min_norm);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrlsqvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const double *           b,
+    double                   tol,
+    int *                    rankA,
+    double *                 x,
+    int *                    p,
+    double *                 min_norm);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrlsqvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuComplex *        b,
+    float                    tol,
+    int *                    rankA,
+    cuComplex *              x,
+    int *                    p,
+    float *                  min_norm);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrlsqvqrHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuDoubleComplex *  b,
+    double                   tol,
+    int *                    rankA,
+    cuDoubleComplex *        x,
+    int *                    p,
+    double *                 min_norm);
+
+  /* --------- CPU eigenvalue solver by shift inverse
+   *      solve A*x = lambda * x
+   *   where lambda is the eigenvalue nearest mu0.
+   * [eig] stands for eigenvalue solver
+   * [si] stands for shift-inverse
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsreigvsiHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    float                    mu0,
+    const float *            x0,
+    int                      maxite,
+    float                    tol,
+    float *                  mu,
+    float *                  x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigvsiHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    double                   mu0,
+    const double *           x0,
+    int                      maxite,
+    double                   tol,
+    double *                 mu,
+    double *                 x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigvsiHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuComplex                mu0,
+    const cuComplex *        x0,
+    int                      maxite,
+    float                    tol,
+    cuComplex *              mu,
+    cuComplex *              x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigvsiHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuDoubleComplex          mu0,
+    const cuDoubleComplex *  x0,
+    int                      maxite,
+    double                   tol,
+    cuDoubleComplex *        mu,
+    cuDoubleComplex *        x);
+
+  /* --------- GPU eigenvalue solver by shift inverse
+   *      solve A*x = lambda * x
+   *   where lambda is the eigenvalue nearest mu0.
+   * [eig] stands for eigenvalue solver
+   * [si] stands for shift-inverse
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsreigvsi(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    float                    mu0,
+    const float *            x0,
+    int                      maxite,
+    float                    eps,
+    float *                  mu,
+    float *                  x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigvsi(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    double                   mu0,
+    const double *           x0,
+    int                      maxite,
+    double                   eps,
+    double *                 mu,
+    double *                 x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigvsi(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuComplex                mu0,
+    const cuComplex *        x0,
+    int                      maxite,
+    float                    eps,
+    cuComplex *              mu,
+    cuComplex *              x);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigvsi(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuDoubleComplex          mu0,
+    const cuDoubleComplex *  x0,
+    int                      maxite,
+    double                   eps,
+    cuDoubleComplex *        mu,
+    cuDoubleComplex *        x);
+
+  // ----------- CPU: number of eigenvalues enclosed in a given box
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsreigsHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuComplex                left_bottom_corner,
+    cuComplex                right_upper_corner,
+    int *                    num_eigs);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsreigsHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuDoubleComplex          left_bottom_corner,
+    cuDoubleComplex          right_upper_corner,
+    int *                    num_eigs);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsreigsHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuComplex                left_bottom_corner,
+    cuComplex                right_upper_corner,
+    int *                    num_eigs);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsreigsHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuDoubleComplex          left_bottom_corner,
+    cuDoubleComplex          right_upper_corner,
+    int *                    num_eigs);
+
+  /* --------- CPU symrcm
+   *   Symmetric reverse Cuthill McKee permutation
+   *
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymrcmHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    p);
+
+  /* --------- CPU symmdq
+   *   Symmetric minimum degree algorithm by quotient graph
+   *
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymmdqHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    p);
+
+  /* --------- CPU symamd
+   *   Symmetric approximate minimum degree algorithm by quotient graph
+   *
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrsymamdHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    p);
+
+  /* --------- CPU metis
+   *   symmetric reordering
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrmetisndHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const int64_t *          options,
+    int *                    p);
+
+  /* --------- CPU zfd
+   *  Zero free diagonal reordering
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrzfdHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    P,
+    int *                    numnz);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrzfdHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    P,
+    int *                    numnz);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrzfdHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    P,
+    int *                    numnz);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrzfdHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    int *                    P,
+    int *                    numnz);
+
+  /* --------- CPU permutation
+   *   P*A*Q^T
+   *
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrperm_bufferSizeHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const int *              p,
+    const int *              q,
+    size_t *                 bufferSizeInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrpermHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    int *                    csrRowPtrA,
+    int *                    csrColIndA,
+    const int *              p,
+    const int *              q,
+    int *                    map,
+    void *                   pBuffer);
+
+  /*
+   *  Low-level API: Batched QR
+   *
+   */
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCreateCsrqrInfo(csrqrInfo_t *info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDestroyCsrqrInfo(csrqrInfo_t info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrqrAnalysisBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfo_t              info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrBufferInfoBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrBufferInfoBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrBufferInfoBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrBufferInfoBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrVal,
+    const int *              csrRowPtr,
+    const int *              csrColInd,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrsvBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const float *            b,
+    float *                  x,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    void *                   pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrsvBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const double *           b,
+    double *                 x,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    void *                   pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrsvBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuComplex *        b,
+    cuComplex *              x,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    void *                   pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrsvBatched(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnz,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    const cuDoubleComplex *  b,
+    cuDoubleComplex *        x,
+    int                      batchSize,
+    csrqrInfo_t              info,
+    void *                   pBuffer);
+
+  #if defined(__cplusplus)
+}
+  #endif /* __cplusplus */
+
+#endif // define CUSOLVERSP_H_
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverSp_LOWLEVEL_PREVIEW.h b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverSp_LOWLEVEL_PREVIEW.h
new file mode 100644
index 0000000000000000000000000000000000000000..e660bb87ea5d89cc1d430dce6c50df006d796809
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverSp_LOWLEVEL_PREVIEW.h
@@ -0,0 +1,1107 @@
+/*
+ * Copyright 2015 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CUSOLVERSP_LOWLEVEL_PREVIEW_H_)
+  #define CUSOLVERSP_LOWLEVEL_PREVIEW_H_
+
+  #include "cusolverSp.h"
+
+  #if defined(__cplusplus)
+extern "C" {
+  #endif /* __cplusplus */
+  /* Handle type taken by the "CPU LU" prototypes below; the struct is only forward-declared in the visible part of this header. */
+  struct csrluInfoHost;
+  typedef struct csrluInfoHost *csrluInfoHost_t;
+  /* Handle type taken by the "CPU QR" prototypes below; the struct is only forward-declared in the visible part of this header. */
+  struct csrqrInfoHost;
+  typedef struct csrqrInfoHost *csrqrInfoHost_t;
+  /* Handle type taken by the "CPU Cholesky" prototypes below; the struct is only forward-declared in the visible part of this header. */
+  struct csrcholInfoHost;
+  typedef struct csrcholInfoHost *csrcholInfoHost_t;
+  /* Handle type taken by the "GPU Cholesky" prototypes below; the struct is only forward-declared in the visible part of this header. */
+  struct csrcholInfo;
+  typedef struct csrcholInfo *csrcholInfo_t;
+
+  /*
+   * Low level API for CPU LU
+   * (every prototype in this group takes a csrluInfoHost_t and returns cusolverStatus_t)
+   */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverSpCreateCsrluInfoHost(csrluInfoHost_t *info); /* takes the address of a handle; presumably stores a new handle there - confirm */
+  /* Takes the handle by value; presumably releases what the Create call produced - confirm. */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverSpDestroyCsrluInfoHost(csrluInfoHost_t info);
+  /* Takes n, nnzA and the index arrays csrRowPtrA/csrColIndA but no value array; presumably the pattern-only analysis step - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrluAnalysisHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrluInfoHost_t          info);
+  /* float variant. internalDataInBytes and workspaceInBytes are non-const size_t pointers, presumably outputs - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrluBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrluInfoHost_t          info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* double variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrluBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrluInfoHost_t          info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* cuComplex variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrluBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrluInfoHost_t          info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* cuDoubleComplex variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrluBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrluInfoHost_t          info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* float variant; pivot_threshold is float. The matrix arrays are const; pBuffer is an untyped writable buffer. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrluFactorHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrluInfoHost_t          info,
+    float                    pivot_threshold,
+    void *                   pBuffer);
+  /* double variant; pivot_threshold is double. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrluFactorHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrluInfoHost_t          info,
+    double                   pivot_threshold,
+    void *                   pBuffer);
+  /* cuComplex variant; note pivot_threshold stays a real float, not cuComplex. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrluFactorHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrluInfoHost_t          info,
+    float                    pivot_threshold,
+    void *                   pBuffer);
+  /* cuDoubleComplex variant; note pivot_threshold stays a real double, not cuDoubleComplex. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrluFactorHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrluInfoHost_t          info,
+    double                   pivot_threshold,
+    void *                   pBuffer);
+  /* S variant: tol is float; position is a non-const int pointer, presumably an output - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrluZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrluInfoHost_t    info,
+    float              tol,
+    int *              position);
+  /* D variant: tol is double. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrluZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrluInfoHost_t    info,
+    double             tol,
+    int *              position);
+  /* C variant: no complex-typed parameter; the parameter types match the S variant (tol is float). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrluZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrluInfoHost_t    info,
+    float              tol,
+    int *              position);
+  /* Z variant: no complex-typed parameter; the parameter types match the D variant (tol is double). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrluZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrluInfoHost_t    info,
+    double             tol,
+    int *              position);
+  /* float variant: b is const (read-only), x is writable; no matrix arrays are passed, only n and the info handle. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrluSolveHost(
+    cusolverSpHandle_t handle,
+    int                n,
+    const float *      b,
+    float *            x,
+    csrluInfoHost_t    info,
+    void *             pBuffer);
+  /* double variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrluSolveHost(
+    cusolverSpHandle_t handle,
+    int                n,
+    const double *     b,
+    double *           x,
+    csrluInfoHost_t    info,
+    void *             pBuffer);
+  /* cuComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrluSolveHost(
+    cusolverSpHandle_t handle,
+    int                n,
+    const cuComplex *  b,
+    cuComplex *        x,
+    csrluInfoHost_t    info,
+    void *             pBuffer);
+  /* cuDoubleComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrluSolveHost(
+    cusolverSpHandle_t     handle,
+    int                    n,
+    const cuDoubleComplex *b,
+    cuDoubleComplex *      x,
+    csrluInfoHost_t        info,
+    void *                 pBuffer);
+  /* nnzLRef and nnzURef are non-const int pointers; presumably they receive the nonzero counts of the L and U factors - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrluNnzHost(
+    cusolverSpHandle_t handle,
+    int *              nnzLRef,
+    int *              nnzURef,
+    csrluInfoHost_t    info);
+  /* float variant. P, Q and all L/U value and index arrays are non-const, presumably caller-provided outputs - confirm required sizes in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrluExtractHost(
+    cusolverSpHandle_t       handle,
+    int *                    P,
+    int *                    Q,
+    const cusparseMatDescr_t descrL,
+    float *                  csrValL,
+    int *                    csrRowPtrL,
+    int *                    csrColIndL,
+    const cusparseMatDescr_t descrU,
+    float *                  csrValU,
+    int *                    csrRowPtrU,
+    int *                    csrColIndU,
+    csrluInfoHost_t          info,
+    void *                   pBuffer);
+  /* double variant; only the csrValL/csrValU element type differs from the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrluExtractHost(
+    cusolverSpHandle_t       handle,
+    int *                    P,
+    int *                    Q,
+    const cusparseMatDescr_t descrL,
+    double *                 csrValL,
+    int *                    csrRowPtrL,
+    int *                    csrColIndL,
+    const cusparseMatDescr_t descrU,
+    double *                 csrValU,
+    int *                    csrRowPtrU,
+    int *                    csrColIndU,
+    csrluInfoHost_t          info,
+    void *                   pBuffer);
+  /* cuComplex variant; only the csrValL/csrValU element type differs from the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrluExtractHost(
+    cusolverSpHandle_t       handle,
+    int *                    P,
+    int *                    Q,
+    const cusparseMatDescr_t descrL,
+    cuComplex *              csrValL,
+    int *                    csrRowPtrL,
+    int *                    csrColIndL,
+    const cusparseMatDescr_t descrU,
+    cuComplex *              csrValU,
+    int *                    csrRowPtrU,
+    int *                    csrColIndU,
+    csrluInfoHost_t          info,
+    void *                   pBuffer);
+  /* cuDoubleComplex variant; only the csrValL/csrValU element type differs from the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrluExtractHost(
+    cusolverSpHandle_t       handle,
+    int *                    P,
+    int *                    Q,
+    const cusparseMatDescr_t descrL,
+    cuDoubleComplex *        csrValL,
+    int *                    csrRowPtrL,
+    int *                    csrColIndL,
+    const cusparseMatDescr_t descrU,
+    cuDoubleComplex *        csrValU,
+    int *                    csrRowPtrU,
+    int *                    csrColIndU,
+    csrluInfoHost_t          info,
+    void *                   pBuffer);
+
+  /*
+   * Low level API for CPU QR
+   * (every prototype in this group takes a csrqrInfoHost_t and returns cusolverStatus_t)
+   */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverSpCreateCsrqrInfoHost(csrqrInfoHost_t *info); /* takes the address of a handle; presumably stores a new handle there - confirm */
+  /* Takes the handle by value; presumably releases what the Create call produced - confirm. */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverSpDestroyCsrqrInfoHost(csrqrInfoHost_t info);
+  /* Takes m, n, nnzA and the index arrays csrRowPtrA/csrColIndA but no value array; presumably the pattern-only analysis step - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrqrAnalysisHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfoHost_t          info);
+  /* float variant. internalDataInBytes and workspaceInBytes are non-const size_t pointers, presumably outputs - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfoHost_t          info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* double variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfoHost_t          info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* cuComplex variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfoHost_t          info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* cuDoubleComplex variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfoHost_t          info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* float variant; mu is a float passed by value (its meaning is not visible here - see cuSOLVER docs). The matrix arrays are const. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrSetupHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    float                    mu,
+    csrqrInfoHost_t          info);
+  /* double variant; mu is a double passed by value. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrSetupHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    double                   mu,
+    csrqrInfoHost_t          info);
+  /* cuComplex variant; here mu is itself complex (cuComplex by value). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrSetupHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuComplex                mu,
+    csrqrInfoHost_t          info);
+  /* cuDoubleComplex variant; here mu is itself complex (cuDoubleComplex by value). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrSetupHost(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuDoubleComplex          mu,
+    csrqrInfoHost_t          info);
+  /* float variant. No matrix arrays are passed (only m, n, nnzA); b and x are both non-const. Presumably works on state held in info - confirm. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrFactorHost(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    int                nnzA,
+    float *            b,
+    float *            x,
+    csrqrInfoHost_t    info,
+    void *             pBuffer);
+  /* double variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrFactorHost(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    int                nnzA,
+    double *           b,
+    double *           x,
+    csrqrInfoHost_t    info,
+    void *             pBuffer);
+  /* cuComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrFactorHost(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    int                nnzA,
+    cuComplex *        b,
+    cuComplex *        x,
+    csrqrInfoHost_t    info,
+    void *             pBuffer);
+  /* cuDoubleComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrFactorHost(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    int                nnzA,
+    cuDoubleComplex *  b,
+    cuDoubleComplex *  x,
+    csrqrInfoHost_t    info,
+    void *             pBuffer);
+  /* S variant: tol is float; position is a non-const int pointer, presumably an output - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrqrInfoHost_t    info,
+    float              tol,
+    int *              position);
+  /* D variant: tol is double. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrqrInfoHost_t    info,
+    double             tol,
+    int *              position);
+  /* C variant: no complex-typed parameter; the parameter types match the S variant (tol is float). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrqrInfoHost_t    info,
+    float              tol,
+    int *              position);
+  /* Z variant: no complex-typed parameter; the parameter types match the D variant (tol is double). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrqrInfoHost_t    info,
+    double             tol,
+    int *              position);
+  /* float variant. Note b is NOT const-qualified here (the csrlu/csrchol SolveHost prototypes take const b); whether b is modified is not visible - confirm. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrSolveHost(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    float *            b,
+    float *            x,
+    csrqrInfoHost_t    info,
+    void *             pBuffer);
+  /* double variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrSolveHost(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    double *           b,
+    double *           x,
+    csrqrInfoHost_t    info,
+    void *             pBuffer);
+  /* cuComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrSolveHost(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    cuComplex *        b,
+    cuComplex *        x,
+    csrqrInfoHost_t    info,
+    void *             pBuffer);
+  /* cuDoubleComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrSolveHost(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    cuDoubleComplex *  b,
+    cuDoubleComplex *  x,
+    csrqrInfoHost_t    info,
+    void *             pBuffer);
+
+  /*
+   * Low level API for GPU QR
+   * (prototypes in this group take csrqrInfo_t, which is not declared in this header before use, so it must come from the included cusolverSp.h)
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrqrAnalysis( /* index arrays only, no value array; presumably pattern-only analysis - confirm */
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfo_t              info);
+  /* float variant. internalDataInBytes and workspaceInBytes are non-const size_t pointers, presumably outputs - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrBufferInfo(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* double variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrBufferInfo(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* cuComplex variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrBufferInfo(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* cuDoubleComplex variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrBufferInfo(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrqrInfo_t              info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* float variant; mu is a float passed by value (its meaning is not visible here - see cuSOLVER docs). The matrix arrays are const. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrSetup(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    float                    mu,
+    csrqrInfo_t              info);
+  /* double variant; mu is a double passed by value. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrSetup(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    double                   mu,
+    csrqrInfo_t              info);
+  /* cuComplex variant; here mu is itself complex (cuComplex by value). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrSetup(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuComplex                mu,
+    csrqrInfo_t              info);
+  /* cuDoubleComplex variant; here mu is itself complex (cuDoubleComplex by value). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrSetup(
+    cusolverSpHandle_t       handle,
+    int                      m,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    cuDoubleComplex          mu,
+    csrqrInfo_t              info);
+  /* float variant. No matrix arrays are passed (only m, n, nnzA); b and x are both non-const. Presumably works on state held in info - confirm. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrFactor(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    int                nnzA,
+    float *            b,
+    float *            x,
+    csrqrInfo_t        info,
+    void *             pBuffer);
+  /* double variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrFactor(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    int                nnzA,
+    double *           b,
+    double *           x,
+    csrqrInfo_t        info,
+    void *             pBuffer);
+  /* cuComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrFactor(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    int                nnzA,
+    cuComplex *        b,
+    cuComplex *        x,
+    csrqrInfo_t        info,
+    void *             pBuffer);
+  /* cuDoubleComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrFactor(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    int                nnzA,
+    cuDoubleComplex *  b,
+    cuDoubleComplex *  x,
+    csrqrInfo_t        info,
+    void *             pBuffer);
+  /* S variant: tol is float; position is a non-const int pointer, presumably an output - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrZeroPivot(
+    cusolverSpHandle_t handle,
+    csrqrInfo_t        info,
+    float              tol,
+    int *              position);
+  /* D variant: tol is double. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrZeroPivot(
+    cusolverSpHandle_t handle,
+    csrqrInfo_t        info,
+    double             tol,
+    int *              position);
+  /* C variant: no complex-typed parameter; the parameter types match the S variant (tol is float). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrZeroPivot(
+    cusolverSpHandle_t handle,
+    csrqrInfo_t        info,
+    float              tol,
+    int *              position);
+  /* Z variant: no complex-typed parameter; the parameter types match the D variant (tol is double). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrZeroPivot(
+    cusolverSpHandle_t handle,
+    csrqrInfo_t        info,
+    double             tol,
+    int *              position);
+  /* float variant. b and x are both non-const; whether b is modified is not visible from the prototype - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrqrSolve(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    float *            b,
+    float *            x,
+    csrqrInfo_t        info,
+    void *             pBuffer);
+  /* double variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrqrSolve(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    double *           b,
+    double *           x,
+    csrqrInfo_t        info,
+    void *             pBuffer);
+  /* cuComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrqrSolve(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    cuComplex *        b,
+    cuComplex *        x,
+    csrqrInfo_t        info,
+    void *             pBuffer);
+  /* cuDoubleComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrqrSolve(
+    cusolverSpHandle_t handle,
+    int                m,
+    int                n,
+    cuDoubleComplex *  b,
+    cuDoubleComplex *  x,
+    csrqrInfo_t        info,
+    void *             pBuffer);
+
+  /*
+   * Low level API for CPU Cholesky
+   * (every prototype in this group takes a csrcholInfoHost_t and returns cusolverStatus_t)
+   */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverSpCreateCsrcholInfoHost(csrcholInfoHost_t *info); /* takes the address of a handle; presumably stores a new handle there - confirm */
+  /* Takes the handle by value; presumably releases what the Create call produced - confirm. */
+  cusolverStatus_t CUSOLVERAPI
+    cusolverSpDestroyCsrcholInfoHost(csrcholInfoHost_t info);
+  /* Takes n, nnzA and the index arrays csrRowPtrA/csrColIndA but no value array; presumably the pattern-only analysis step - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholAnalysisHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfoHost_t        info);
+  /* float variant. internalDataInBytes and workspaceInBytes are non-const size_t pointers, presumably outputs - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrcholBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfoHost_t        info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* double variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrcholBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfoHost_t        info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* cuComplex variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrcholBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfoHost_t        info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* cuDoubleComplex variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrcholBufferInfoHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfoHost_t        info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+  /* float variant. Unlike the csrlu FactorHost prototypes there is no pivot_threshold parameter; the matrix arrays are const and pBuffer is writable. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrcholFactorHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfoHost_t        info,
+    void *                   pBuffer);
+  /* double variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrcholFactorHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfoHost_t        info,
+    void *                   pBuffer);
+  /* cuComplex variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrcholFactorHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfoHost_t        info,
+    void *                   pBuffer);
+  /* cuDoubleComplex variant; parameter list otherwise identical to the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrcholFactorHost(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfoHost_t        info,
+    void *                   pBuffer);
+  /* S variant: tol is float; position is a non-const int pointer, presumably an output - confirm in cuSOLVER docs. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrcholZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrcholInfoHost_t  info,
+    float              tol,
+    int *              position);
+  /* D variant: tol is double. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrcholZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrcholInfoHost_t  info,
+    double             tol,
+    int *              position);
+  /* C variant: no complex-typed parameter; the parameter types match the S variant (tol is float). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrcholZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrcholInfoHost_t  info,
+    float              tol,
+    int *              position);
+  /* Z variant: no complex-typed parameter; the parameter types match the D variant (tol is double). */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrcholZeroPivotHost(
+    cusolverSpHandle_t handle,
+    csrcholInfoHost_t  info,
+    double             tol,
+    int *              position);
+  /* float variant: b is const (read-only), x is writable; no matrix arrays are passed, only n and the info handle. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrcholSolveHost(
+    cusolverSpHandle_t handle,
+    int                n,
+    const float *      b,
+    float *            x,
+    csrcholInfoHost_t  info,
+    void *             pBuffer);
+  /* double variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrcholSolveHost(
+    cusolverSpHandle_t handle,
+    int                n,
+    const double *     b,
+    double *           x,
+    csrcholInfoHost_t  info,
+    void *             pBuffer);
+  /* cuComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrcholSolveHost(
+    cusolverSpHandle_t handle,
+    int                n,
+    const cuComplex *  b,
+    cuComplex *        x,
+    csrcholInfoHost_t  info,
+    void *             pBuffer);
+  /* cuDoubleComplex variant; same shape as the float prototype. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrcholSolveHost(
+    cusolverSpHandle_t     handle,
+    int                    n,
+    const cuDoubleComplex *b,
+    cuDoubleComplex *      x,
+    csrcholInfoHost_t      info,
+    void *                 pBuffer);
+
+  /*
+   * Low level API for GPU Cholesky
+   * (the prototypes that follow take a csrcholInfo_t rather than the csrcholInfoHost_t used by the CPU group)
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpCreateCsrcholInfo(csrcholInfo_t *info); /* takes the address of a handle; presumably stores a new handle there - confirm */
+  /* Takes the handle by value; presumably releases what the Create call produced - confirm. */
+  cusolverStatus_t CUSOLVERAPI cusolverSpDestroyCsrcholInfo(csrcholInfo_t info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpXcsrcholAnalysis( /* single untyped ("X") entry point */
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const int *              csrRowPtrA, /* only the CSR index arrays are passed; */
+    const int *              csrColIndA, /* there is no csrValA parameter here    */
+    csrcholInfo_t            info);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrcholBufferInfo(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,    /* element type is the only difference between S/D/C/Z variants */
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfo_t            info,
+    size_t *                 internalDataInBytes, /* non-const size_t*: presumably an output -- TODO confirm */
+    size_t *                 workspaceInBytes);   /* non-const size_t*: presumably an output -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrcholBufferInfo(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfo_t            info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrcholBufferInfo(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfo_t            info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrcholBufferInfo(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfo_t            info,
+    size_t *                 internalDataInBytes,
+    size_t *                 workspaceInBytes);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrcholFactor(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const float *            csrValA,    /* const: matrix values are not written through this pointer */
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfo_t            info,
+    void *                   pBuffer);   /* presumably sized from csrcholBufferInfo's workspaceInBytes -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrcholFactor(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const double *           csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfo_t            info,
+    void *                   pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrcholFactor(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuComplex *        csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfo_t            info,
+    void *                   pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrcholFactor(
+    cusolverSpHandle_t       handle,
+    int                      n,
+    int                      nnzA,
+    const cusparseMatDescr_t descrA,
+    const cuDoubleComplex *  csrValA,
+    const int *              csrRowPtrA,
+    const int *              csrColIndA,
+    csrcholInfo_t            info,
+    void *                   pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrcholZeroPivot(
+    cusolverSpHandle_t handle,
+    csrcholInfo_t      info,      /* csrcholInfo_t here; the *Host prototypes take csrcholInfoHost_t */
+    float              tol,
+    int *              position); /* non-const int*: presumably receives the pivot position -- TODO confirm */
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrcholZeroPivot(
+    cusolverSpHandle_t handle,
+    csrcholInfo_t      info,
+    double             tol,
+    int *              position);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrcholZeroPivot(
+    cusolverSpHandle_t handle,
+    csrcholInfo_t      info,
+    float              tol,       /* C variant: tol is a real float, not cuComplex */
+    int *              position);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrcholZeroPivot(
+    cusolverSpHandle_t handle,
+    csrcholInfo_t      info,
+    double             tol,       /* Z variant: tol is a real double, not cuDoubleComplex */
+    int *              position);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrcholSolve(
+    cusolverSpHandle_t handle,
+    int                n,
+    const float *      b,        /* const: not written through by the call */
+    float *            x,        /* non-const: presumably the solution output -- TODO confirm */
+    csrcholInfo_t      info,     /* csrcholInfo_t here; the *Host prototypes take csrcholInfoHost_t */
+    void *             pBuffer); /* untyped caller-supplied buffer; required size is not stated here */
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrcholSolve(
+    cusolverSpHandle_t handle,
+    int                n,
+    const double *     b,
+    double *           x,
+    csrcholInfo_t      info,
+    void *             pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrcholSolve(
+    cusolverSpHandle_t handle,
+    int                n,
+    const cuComplex *  b,
+    cuComplex *        x,
+    csrcholInfo_t      info,
+    void *             pBuffer);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrcholSolve(
+    cusolverSpHandle_t     handle,
+    int                    n,
+    const cuDoubleComplex *b,
+    cuDoubleComplex *      x,
+    csrcholInfo_t          info,
+    void *                 pBuffer);
+
+  /*
+   * "diag" is a device array of size N.
+   * cusolverSp<t>csrcholDiag writes diag(L) to "diag", where A(P,P) = L*L**T.
+   * "diag" can be used to compute det(A), because
+   * det(A) = det(A(P,P)) = det(L)^2 when A(P,P) = L*L**T.
+   *
+   * cusolverSp<t>csrcholDiag must be called after cusolverSp<t>csrcholFactor;
+   * otherwise the contents of "diag" are not valid.
+   */
+  cusolverStatus_t CUSOLVERAPI cusolverSpScsrcholDiag(
+    cusolverSpHandle_t handle,
+    csrcholInfo_t      info,
+    float *            diag);  /* non-const: written by the call */
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpDcsrcholDiag(
+    cusolverSpHandle_t handle,
+    csrcholInfo_t      info,
+    double *           diag);
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpCcsrcholDiag(
+    cusolverSpHandle_t handle,
+    csrcholInfo_t      info,
+    float *            diag);  /* C variant: diag is float*, not cuComplex* */
+
+  cusolverStatus_t CUSOLVERAPI cusolverSpZcsrcholDiag(
+    cusolverSpHandle_t handle,
+    csrcholInfo_t      info,
+    double *           diag);  /* Z variant: diag is double*, not cuDoubleComplex* */
+
+  #if defined(__cplusplus)
+}
+  #endif /* __cplusplus */
+
+#endif // CUSOLVERSP_LOWLEVEL_PREVIEW_H_
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolver_common.h b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolver_common.h
new file mode 100644
index 0000000000000000000000000000000000000000..204dffef076fbce62066e98a5a8b041695fc7aad
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolver_common.h
@@ -0,0 +1,261 @@
+/*
+ * Copyright 2014 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+
+#if !defined(CUSOLVER_COMMON_H_)
+  #define CUSOLVER_COMMON_H_
+
+  #include "library_types.h"
+
+  #ifndef CUSOLVERAPI // a definition made before this header is included is left untouched
+    #ifdef _WIN32
+      #define CUSOLVERAPI __stdcall // calling-convention keyword placed on every prototype
+    #else
+      #define CUSOLVERAPI // expands to nothing outside _WIN32
+    #endif
+  #endif
+
+  #if defined(_MSC_VER) // MSVC: int64_t is declared by hand instead of including a header
+typedef __int64 int64_t;
+  #else
+    #include <inttypes.h> // all other compilers: take int64_t from the standard header
+  #endif
+
+typedef int cusolver_int_t; /* alias of plain int (not a fixed-width type) */
+
+  #define CUSOLVER_VER_MAJOR 11
+  #define CUSOLVER_VER_MINOR 6
+  #define CUSOLVER_VER_PATCH 1
+  #define CUSOLVER_VER_BUILD 9 // not folded into CUSOLVER_VERSION
+  #define CUSOLVER_VERSION                                                     \
+    (CUSOLVER_VER_MAJOR * 1000 + CUSOLVER_VER_MINOR * 100 + CUSOLVER_VER_PATCH) // = 11601 with the values above
+
+//------------------------------------------------------------------------------
+
+  #if !defined(_MSC_VER)
+    #define CUSOLVER_CPP_VERSION __cplusplus // when compiled as C, __cplusplus is undefined and reads as 0 inside #if
+  #elif _MSC_FULL_VER >= 190024210 // Visual Studio 2015 Update 3
+    #define CUSOLVER_CPP_VERSION _MSVC_LANG
+  #else
+    #define CUSOLVER_CPP_VERSION 0 // older MSVC: treated as "no C++14 attributes"
+  #endif
+
+//------------------------------------------------------------------------------
+
+  #if !defined(DISABLE_CUSOLVER_DEPRECATED)
+  // CUSOLVER_DEPRECATED(new_func): deprecation attribute naming the replacement, chosen by language level / compiler.
+    #if CUSOLVER_CPP_VERSION >= 201402L
+
+      #define CUSOLVER_DEPRECATED(new_func)                                    \
+        [[deprecated("please use " #new_func " instead")]]
+
+    #elif defined(_MSC_VER)
+
+      #define CUSOLVER_DEPRECATED(new_func)                                    \
+        __declspec(deprecated("please use " #new_func " instead"))
+
+    #elif defined(__INTEL_COMPILER) || defined(__clang__) ||                   \
+      (defined(__GNUC__) &&                                                    \
+       (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
+
+      #define CUSOLVER_DEPRECATED(new_func)                                    \
+        __attribute__((deprecated("please use " #new_func " instead")))
+
+    #elif defined(__GNUC__) || defined(__xlc__)
+      // attribute without a message: new_func is accepted but unused in this branch
+      #define CUSOLVER_DEPRECATED(new_func) __attribute__((deprecated))
+
+    #else
+
+      #define CUSOLVER_DEPRECATED(new_func)
+
+    #endif // CUSOLVER_DEPRECATED selection (CUSOLVER_CPP_VERSION >= 201402L chain)
+  //------------------------------------------------------------------------------
+  // CUSOLVER_DEPRECATED_ENUM(new_enum): same idea for enumerators; the [[...]] form needs C++17 here.
+    #if CUSOLVER_CPP_VERSION >= 201703L
+
+      #define CUSOLVER_DEPRECATED_ENUM(new_enum)                               \
+        [[deprecated("please use " #new_enum " instead")]]
+
+    #elif defined(__clang__) ||                                                \
+      (defined(__GNUC__) && __GNUC__ >= 6 && !defined(__PGI))
+
+      #define CUSOLVER_DEPRECATED_ENUM(new_enum)                               \
+        __attribute__((deprecated("please use " #new_enum " instead")))
+
+    #else
+
+      #define CUSOLVER_DEPRECATED_ENUM(new_enum)
+
+    #endif // CUSOLVER_DEPRECATED_ENUM selection (CUSOLVER_CPP_VERSION >= 201703L chain)
+
+  #else // defined(DISABLE_CUSOLVER_DEPRECATED)
+
+    #define CUSOLVER_DEPRECATED(new_func)
+    #define CUSOLVER_DEPRECATED_ENUM(new_enum)
+
+  #endif // !defined(DISABLE_CUSOLVER_DEPRECATED)
+
+  #undef CUSOLVER_CPP_VERSION
+
+  #if defined(__cplusplus)
+extern "C" {
+  #endif /* __cplusplus */
+
+  typedef enum { /* status codes; values are explicit and not contiguous */
+    CUSOLVER_STATUS_SUCCESS = 0,
+    CUSOLVER_STATUS_NOT_INITIALIZED = 1,
+    CUSOLVER_STATUS_ALLOC_FAILED = 2,
+    CUSOLVER_STATUS_INVALID_VALUE = 3,
+    CUSOLVER_STATUS_ARCH_MISMATCH = 4,
+    CUSOLVER_STATUS_MAPPING_ERROR = 5,
+    CUSOLVER_STATUS_EXECUTION_FAILED = 6,
+    CUSOLVER_STATUS_INTERNAL_ERROR = 7,
+    CUSOLVER_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8,
+    CUSOLVER_STATUS_NOT_SUPPORTED = 9, /* cusparseStatus_t assigns 9 to ZERO_PIVOT and 10 to NOT_SUPPORTED; */
+    CUSOLVER_STATUS_ZERO_PIVOT = 10,   /* the two enums are not numerically interchangeable                 */
+    CUSOLVER_STATUS_INVALID_LICENSE = 11,
+    CUSOLVER_STATUS_IRS_PARAMS_NOT_INITIALIZED = 12,
+    CUSOLVER_STATUS_IRS_PARAMS_INVALID = 13,
+    CUSOLVER_STATUS_IRS_PARAMS_INVALID_PREC = 14,
+    CUSOLVER_STATUS_IRS_PARAMS_INVALID_REFINE = 15,
+    CUSOLVER_STATUS_IRS_PARAMS_INVALID_MAXITER = 16,
+    CUSOLVER_STATUS_IRS_INTERNAL_ERROR = 20, /* 17-19 are not assigned */
+    CUSOLVER_STATUS_IRS_NOT_SUPPORTED = 21,
+    CUSOLVER_STATUS_IRS_OUT_OF_RANGE = 22,
+    CUSOLVER_STATUS_IRS_NRHS_NOT_SUPPORTED_FOR_REFINE_GMRES = 23,
+    CUSOLVER_STATUS_IRS_INFOS_NOT_INITIALIZED = 25, /* 24 is not assigned */
+    CUSOLVER_STATUS_IRS_INFOS_NOT_DESTROYED = 26,
+    CUSOLVER_STATUS_IRS_MATRIX_SINGULAR = 30, /* 27-29 are not assigned */
+    CUSOLVER_STATUS_INVALID_WORKSPACE = 31
+  } cusolverStatus_t;
+
+  typedef enum { /* numbering starts at 1, so a zero-initialized variable matches no enumerator */
+    CUSOLVER_EIG_TYPE_1 = 1,
+    CUSOLVER_EIG_TYPE_2 = 2,
+    CUSOLVER_EIG_TYPE_3 = 3
+  } cusolverEigType_t;
+
+  typedef enum { /* presumably selects whether eigenvectors are produced -- TODO confirm against cuSOLVER docs */
+    CUSOLVER_EIG_MODE_NOVECTOR = 0,
+    CUSOLVER_EIG_MODE_VECTOR = 1
+  } cusolverEigMode_t;
+
+  typedef enum { /* values start at 1001, so 0 matches no enumerator */
+    CUSOLVER_EIG_RANGE_ALL = 1001,
+    CUSOLVER_EIG_RANGE_I = 1002,
+    CUSOLVER_EIG_RANGE_V = 1003, /* trailing comma after the last enumerator (valid from C99 / C++11) */
+  } cusolverEigRange_t;
+
+  typedef enum { /* values start at 104, so 0 matches no enumerator */
+    CUSOLVER_INF_NORM = 104,
+    CUSOLVER_MAX_NORM = 105,
+    CUSOLVER_ONE_NORM = 106,
+    CUSOLVER_FRO_NORM = 107, /* trailing comma after the last enumerator (valid from C99 / C++11) */
+  } cusolverNorm_t;
+
+  typedef enum { /* one enum carries two name groups: IRS_REFINE_* (1100-1106) and PREC_* (1150-1152) */
+    CUSOLVER_IRS_REFINE_NOT_SET = 1100,
+    CUSOLVER_IRS_REFINE_NONE = 1101,
+    CUSOLVER_IRS_REFINE_CLASSICAL = 1102,
+    CUSOLVER_IRS_REFINE_CLASSICAL_GMRES = 1103,
+    CUSOLVER_IRS_REFINE_GMRES = 1104,
+    CUSOLVER_IRS_REFINE_GMRES_GMRES = 1105,
+    CUSOLVER_IRS_REFINE_GMRES_NOPCOND = 1106,
+    /* PREC_* enumerators share this type rather than cusolverPrecType_t */
+    CUSOLVER_PREC_DD = 1150,
+    CUSOLVER_PREC_SS = 1151,
+    CUSOLVER_PREC_SHT = 1152,
+
+  } cusolverIRSRefinement_t;
+
+  typedef enum { /* R_* occupy 1201-1208, C_* occupy 1211-1218: each C_x equals R_x + 10 */
+    CUSOLVER_R_8I = 1201,
+    CUSOLVER_R_8U = 1202,
+    CUSOLVER_R_64F = 1203,
+    CUSOLVER_R_32F = 1204,
+    CUSOLVER_R_16F = 1205,
+    CUSOLVER_R_16BF = 1206,
+    CUSOLVER_R_TF32 = 1207,
+    CUSOLVER_R_AP = 1208,
+    CUSOLVER_C_8I = 1211, /* 1209-1210 are not assigned */
+    CUSOLVER_C_8U = 1212,
+    CUSOLVER_C_64F = 1213,
+    CUSOLVER_C_32F = 1214,
+    CUSOLVER_C_16F = 1215,
+    CUSOLVER_C_16BF = 1216,
+    CUSOLVER_C_TF32 = 1217,
+    CUSOLVER_C_AP = 1218, /* trailing comma after the last enumerator (valid from C99 / C++11) */
+  } cusolverPrecType_t;
+
+  typedef enum { /* algorithm selector; a zero-initialized value equals CUSOLVER_ALG_0 */
+    CUSOLVER_ALG_0 = 0, /* default algorithm */
+    CUSOLVER_ALG_1 = 1,
+    CUSOLVER_ALG_2 = 2
+  } cusolverAlgMode_t;
+
+  typedef enum { /* NOTE(review): enumerators carry a CUBLAS_ prefix although the type is cusolver*; check for name clashes with other headers */
+    CUBLAS_STOREV_COLUMNWISE = 0,
+    CUBLAS_STOREV_ROWWISE = 1
+  } cusolverStorevMode_t;
+
+  typedef enum { /* NOTE(review): enumerators carry a CUBLAS_ prefix although the type is cusolver*; check for name clashes with other headers */
+    CUBLAS_DIRECT_FORWARD = 0,
+    CUBLAS_DIRECT_BACKWARD = 1
+  } cusolverDirectMode_t;
+
+  cusolverStatus_t CUSOLVERAPI /* neither query below takes a library handle */
+    cusolverGetProperty(libraryPropertyType type, int *value); /* result is returned through value */
+
+  cusolverStatus_t CUSOLVERAPI cusolverGetVersion(int *version); /* presumably reports CUSOLVER_VERSION of the loaded library -- TODO confirm */
+
+  #if defined(__cplusplus)
+}
+  #endif /* __cplusplus */
+
+#endif // CUSOLVER_COMMON_H_
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/lib/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cusolver/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusolver/lib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cusolver/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..96b7908a95426951b4bb42ae4e290ae53c23cf4c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cusolver/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusparse/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cusparse/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusparse/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cusparse/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ebd7f57ee86fa05f2a2dda7eed68f47a29e8e1c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cusparse/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..76c0bef2d86f98a9b013e85e8dce5eef77421e22
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h b/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h
new file mode 100644
index 0000000000000000000000000000000000000000..8ad24a1ead7943e333919affd5f506ed70f05aea
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse.h
@@ -0,0 +1,6106 @@
+/*
+ * Copyright 1993-2023 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(CUSPARSE_H_)
+#define CUSPARSE_H_
+
+#include <cuComplex.h>        // cuComplex
+#include <cuda_runtime_api.h> // cudaStream_t
+#include <library_types.h>    // CUDA_R_32F
+#include <stdint.h>           // int64_t
+#include <stdio.h>            // FILE*
+
+#if defined(__cplusplus)
+#   include <cuda_fp16.h>     // __half
+#endif // defined(__cplusplus)
+
+//##############################################################################
+//# CUSPARSE VERSION INFORMATION
+//##############################################################################
+
+#define CUSPARSE_VER_MAJOR 12
+#define CUSPARSE_VER_MINOR 3
+#define CUSPARSE_VER_PATCH 1
+#define CUSPARSE_VER_BUILD 170 // not folded into CUSPARSE_VERSION
+#define CUSPARSE_VERSION (CUSPARSE_VER_MAJOR * 1000 + \
+                          CUSPARSE_VER_MINOR *  100 + \
+                          CUSPARSE_VER_PATCH) // = 12301 with the values above
+
+// #############################################################################
+// # BASIC MACROS
+// #############################################################################
+
+#if !defined(CUSPARSEAPI) // a definition made before this header is included is left untouched
+#    if defined(_WIN32)
+#        define CUSPARSEAPI __stdcall // calling-convention keyword placed on every prototype
+#    else
+#        define CUSPARSEAPI // expands to nothing outside _WIN32
+#    endif
+#endif
+
+//------------------------------------------------------------------------------
+
+#if !defined(_MSC_VER)
+#   define CUSPARSE_CPP_VERSION __cplusplus // when compiled as C, __cplusplus is undefined and reads as 0 inside #if
+#elif _MSC_FULL_VER >= 190024210 // Visual Studio 2015 Update 3
+#   define CUSPARSE_CPP_VERSION _MSVC_LANG
+#else
+#   define CUSPARSE_CPP_VERSION 0 // older MSVC: treated as "no C++14 attributes"
+#endif
+
+// #############################################################################
+// # CUSPARSE_DEPRECATED MACRO
+// #############################################################################
+
+#if !defined(DISABLE_CUSPARSE_DEPRECATED)
+// Deprecation attribute macros; the spelling is chosen by language level, then by compiler.
+#   if CUSPARSE_CPP_VERSION >= 201402L
+
+#       define CUSPARSE_DEPRECATED_REPLACE_WITH(new_func)                      \
+            [[deprecated("please use " #new_func " instead")]]
+
+#       define CUSPARSE_DEPRECATED                                             \
+         [[deprecated("The routine will be removed in the next major release")]]
+
+#       define CUSPARSE_DEPRECATED_TYPE                                        \
+         [[deprecated("The type will be removed in the next major release")]]
+
+#       define CUSPARSE_DEPRECATED_TYPE_MSVC
+
+#   elif defined(_MSC_VER)
+
+#       define CUSPARSE_DEPRECATED_REPLACE_WITH(new_func)                      \
+            __declspec(deprecated("please use " #new_func " instead"))
+
+#       define CUSPARSE_DEPRECATED                                             \
+            __declspec(deprecated(                                             \
+                "The routine will be removed in the next major release"))
+
+#       define CUSPARSE_DEPRECATED_TYPE
+// Only branch where the prefix-position macro is non-empty; the trailing '\' keeps the __declspec inside the macro.
+#       define CUSPARSE_DEPRECATED_TYPE_MSVC                                   \
+            __declspec(deprecated(                                             \
+                "The type will be removed in the next major release"))
+
+#   elif defined(__INTEL_COMPILER) || defined(__clang__) ||                    \
+         (defined(__GNUC__) &&                                                 \
+          (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)))
+
+#       define CUSPARSE_DEPRECATED_REPLACE_WITH(new_func)                      \
+            __attribute__((deprecated("please use " #new_func " instead")))
+
+#       define CUSPARSE_DEPRECATED                                             \
+            __attribute__((deprecated(                                         \
+                "The routine will be removed in the next major release")))
+
+#       define CUSPARSE_DEPRECATED_TYPE                                        \
+            __attribute__((deprecated(                                         \
+                "The type will be removed in the next major release")))
+
+#       define CUSPARSE_DEPRECATED_TYPE_MSVC
+
+#   elif defined(__GNUC__) || defined(__xlc__)
+
+#       define CUSPARSE_DEPRECATED_REPLACE_WITH(new_func)                      \
+            __attribute__((deprecated))
+
+#       define CUSPARSE_DEPRECATED      __attribute__((deprecated))
+#       define CUSPARSE_DEPRECATED_TYPE __attribute__((deprecated))
+#       define CUSPARSE_DEPRECATED_TYPE_MSVC
+
+#   else
+
+#       define CUSPARSE_DEPRECATED_REPLACE_WITH(new_func)
+#       define CUSPARSE_DEPRECATED
+#       define CUSPARSE_DEPRECATED_TYPE
+#       define CUSPARSE_DEPRECATED_TYPE_MSVC
+
+#   endif // routine/type macro selection (CUSPARSE_CPP_VERSION >= 201402L chain)
+//------------------------------------------------------------------------------
+
+#   if CUSPARSE_CPP_VERSION >= 201703L
+
+#       define CUSPARSE_DEPRECATED_ENUM_REPLACE_WITH(new_enum)                 \
+            [[deprecated("please use " #new_enum " instead")]]
+
+#       define CUSPARSE_DEPRECATED_ENUM                                        \
+            [[deprecated("The enum will be removed in the next major release")]]
+
+#   elif defined(__clang__) ||                                                 \
+         (defined(__GNUC__) && __GNUC__ >= 6 && !defined(__PGI))
+
+#       define CUSPARSE_DEPRECATED_ENUM_REPLACE_WITH(new_enum)                 \
+            __attribute__((deprecated("please use " #new_enum " instead")))
+
+#       define CUSPARSE_DEPRECATED_ENUM                                        \
+            __attribute__((deprecated(                                         \
+                "The enum will be removed in the next major release")))
+
+#   else
+
+#       define CUSPARSE_DEPRECATED_ENUM_REPLACE_WITH(new_enum)
+#       define CUSPARSE_DEPRECATED_ENUM
+
+#   endif // enum macro selection (CUSPARSE_CPP_VERSION >= 201703L chain)
+
+#else // defined(DISABLE_CUSPARSE_DEPRECATED)
+
+#   define CUSPARSE_DEPRECATED_REPLACE_WITH(new_func)
+#   define CUSPARSE_DEPRECATED
+#   define CUSPARSE_DEPRECATED_TYPE
+#   define CUSPARSE_DEPRECATED_TYPE_MSVC
+#   define CUSPARSE_DEPRECATED_ENUM_REPLACE_WITH(new_enum)
+#   define CUSPARSE_DEPRECATED_ENUM
+
+#endif // !defined(DISABLE_CUSPARSE_DEPRECATED)
+
+#undef CUSPARSE_CPP_VERSION
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus)
+extern "C" {
+#endif // defined(__cplusplus)
+
+//##############################################################################
+//# OPAQUE DATA STRUCTURES
+//##############################################################################
+
+struct cusparseContext; // forward declaration only: callers hold the pointer typedef below
+typedef struct cusparseContext* cusparseHandle_t;
+
+struct cusparseMatDescr;
+typedef struct cusparseMatDescr* cusparseMatDescr_t;
+
+struct bsrsv2Info;
+typedef CUSPARSE_DEPRECATED_TYPE_MSVC // prefix slot, intended for the __declspec spelling
+struct bsrsv2Info* bsrsv2Info_t CUSPARSE_DEPRECATED_TYPE; // trailing slot for the [[deprecated]] / __attribute__ spelling
+
+struct bsrsm2Info;
+typedef CUSPARSE_DEPRECATED_TYPE_MSVC
+struct bsrsm2Info* bsrsm2Info_t CUSPARSE_DEPRECATED_TYPE;
+
+struct csric02Info;
+typedef CUSPARSE_DEPRECATED_TYPE_MSVC
+struct csric02Info* csric02Info_t CUSPARSE_DEPRECATED_TYPE;
+
+struct bsric02Info;
+typedef CUSPARSE_DEPRECATED_TYPE_MSVC
+struct bsric02Info* bsric02Info_t CUSPARSE_DEPRECATED_TYPE;
+
+struct csrilu02Info;
+typedef CUSPARSE_DEPRECATED_TYPE_MSVC
+struct csrilu02Info* csrilu02Info_t CUSPARSE_DEPRECATED_TYPE;
+
+struct bsrilu02Info;
+typedef CUSPARSE_DEPRECATED_TYPE_MSVC
+struct bsrilu02Info* bsrilu02Info_t CUSPARSE_DEPRECATED_TYPE;
+
+struct csru2csrInfo;
+typedef CUSPARSE_DEPRECATED_TYPE_MSVC
+struct csru2csrInfo* csru2csrInfo_t CUSPARSE_DEPRECATED_TYPE;
+
+struct cusparseColorInfo;
+typedef CUSPARSE_DEPRECATED_TYPE_MSVC
+struct cusparseColorInfo* cusparseColorInfo_t CUSPARSE_DEPRECATED_TYPE;
+
+struct pruneInfo;
+typedef CUSPARSE_DEPRECATED_TYPE_MSVC
+struct pruneInfo* pruneInfo_t CUSPARSE_DEPRECATED_TYPE;
+
+//##############################################################################
+//# ENUMERATORS
+//##############################################################################
+
+typedef enum { // status codes, contiguous 0-11
+    CUSPARSE_STATUS_SUCCESS                   = 0,
+    CUSPARSE_STATUS_NOT_INITIALIZED           = 1,
+    CUSPARSE_STATUS_ALLOC_FAILED              = 2,
+    CUSPARSE_STATUS_INVALID_VALUE             = 3,
+    CUSPARSE_STATUS_ARCH_MISMATCH             = 4,
+    CUSPARSE_STATUS_MAPPING_ERROR             = 5,
+    CUSPARSE_STATUS_EXECUTION_FAILED          = 6,
+    CUSPARSE_STATUS_INTERNAL_ERROR            = 7,
+    CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED = 8,
+    CUSPARSE_STATUS_ZERO_PIVOT                = 9,  // cusolverStatus_t assigns 9 to NOT_SUPPORTED and 10 to ZERO_PIVOT;
+    CUSPARSE_STATUS_NOT_SUPPORTED             = 10, // the two enums are not numerically interchangeable
+    CUSPARSE_STATUS_INSUFFICIENT_RESOURCES    = 11
+} cusparseStatus_t;
+
+typedef enum { // a zero-initialized value equals CUSPARSE_POINTER_MODE_HOST
+    CUSPARSE_POINTER_MODE_HOST   = 0,
+    CUSPARSE_POINTER_MODE_DEVICE = 1
+} cusparsePointerMode_t;
+
+typedef enum { // a zero-initialized value equals CUSPARSE_ACTION_SYMBOLIC
+    CUSPARSE_ACTION_SYMBOLIC = 0,
+    CUSPARSE_ACTION_NUMERIC  = 1
+} cusparseAction_t;
+
+typedef enum { // value type of cusparseSetMatType / cusparseGetMatType; zero equals GENERAL
+    CUSPARSE_MATRIX_TYPE_GENERAL    = 0,
+    CUSPARSE_MATRIX_TYPE_SYMMETRIC  = 1,
+    CUSPARSE_MATRIX_TYPE_HERMITIAN  = 2,
+    CUSPARSE_MATRIX_TYPE_TRIANGULAR = 3
+} cusparseMatrixType_t;
+
+typedef enum { // value type of cusparseSetMatFillMode / cusparseGetMatFillMode; zero equals LOWER
+    CUSPARSE_FILL_MODE_LOWER = 0,
+    CUSPARSE_FILL_MODE_UPPER = 1
+} cusparseFillMode_t;
+
+typedef enum { // value type of cusparseSetMatDiagType / cusparseGetMatDiagType; zero equals NON_UNIT
+    CUSPARSE_DIAG_TYPE_NON_UNIT = 0,
+    CUSPARSE_DIAG_TYPE_UNIT     = 1
+} cusparseDiagType_t;
+
+typedef enum { // each enumerator's numeric value equals the index base it names (0 or 1)
+    CUSPARSE_INDEX_BASE_ZERO = 0,
+    CUSPARSE_INDEX_BASE_ONE  = 1
+} cusparseIndexBase_t;
+
+typedef enum { // a zero-initialized value equals CUSPARSE_OPERATION_NON_TRANSPOSE
+    CUSPARSE_OPERATION_NON_TRANSPOSE       = 0,
+    CUSPARSE_OPERATION_TRANSPOSE           = 1,
+    CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
+} cusparseOperation_t;
+
+typedef enum { // a zero-initialized value equals CUSPARSE_DIRECTION_ROW
+    CUSPARSE_DIRECTION_ROW    = 0,
+    CUSPARSE_DIRECTION_COLUMN = 1
+} cusparseDirection_t;
+
+typedef enum { // deprecated type: only the trailing attribute macro is applied (no _MSVC prefix macro)
+    CUSPARSE_SOLVE_POLICY_NO_LEVEL = 0,
+    CUSPARSE_SOLVE_POLICY_USE_LEVEL = 1
+} cusparseSolvePolicy_t CUSPARSE_DEPRECATED_TYPE;
+
+typedef enum { // deprecated type: only the trailing attribute macro is applied (no _MSVC prefix macro)
+    CUSPARSE_COLOR_ALG0 = 0, // default
+    CUSPARSE_COLOR_ALG1 = 1
+} cusparseColorAlg_t CUSPARSE_DEPRECATED_TYPE;
+
+//##############################################################################
+//# INITIALIZATION AND MANAGEMENT ROUTINES
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreate(cusparseHandle_t* handle); // handle is passed by pointer here
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroy(cusparseHandle_t handle); // handle is passed by value here
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGetVersion(cusparseHandle_t handle, // unlike cusolverGetVersion, this query takes a handle
+                   int*             version);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGetProperty(libraryPropertyType type, // no handle parameter
+                    int*                value);
+
+const char* CUSPARSEAPI // returns a string rather than a cusparseStatus_t; ownership is not stated here
+cusparseGetErrorName(cusparseStatus_t status);
+
+const char* CUSPARSEAPI // returns a string rather than a cusparseStatus_t; ownership is not stated here
+cusparseGetErrorString(cusparseStatus_t status);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetStream(cusparseHandle_t handle,
+                  cudaStream_t     streamId);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGetStream(cusparseHandle_t handle,
+                  cudaStream_t*    streamId); // getter: result is returned through the pointer
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGetPointerMode(cusparseHandle_t       handle,
+                       cusparsePointerMode_t* mode); // getter: result is returned through the pointer
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetPointerMode(cusparseHandle_t      handle,
+                       cusparsePointerMode_t mode);
+
+//##############################################################################
+//# LOGGING APIs
+//##############################################################################
+
+typedef void (*cusparseLoggerCallback_t)(int         logLevel, // callback type: returns nothing, receives an int level and two C strings
+                                         const char* functionName,
+                                         const char* message);
+
+cusparseStatus_t CUSPARSEAPI // no handle parameter: presumably a process-wide setting — TODO confirm
+cusparseLoggerSetCallback(cusparseLoggerCallback_t callback);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerSetFile(FILE* file); // FILE comes from <stdio.h>, which must be visible before this declaration
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerOpenFile(const char* logFile); // takes a C string (by name, a log file path) instead of an already-open FILE*
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerSetLevel(int level); // plain int; the accepted range is not stated in this header excerpt
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerSetMask(int mask); // plain int; by name a bit mask — bit meanings are not stated here
+
+cusparseStatus_t CUSPARSEAPI
+cusparseLoggerForceDisable(void); // explicit (void): takes no arguments
+
+//##############################################################################
+//# HELPER ROUTINES
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateMatDescr(cusparseMatDescr_t* descrA); // descriptor is passed by pointer: presumably an out-parameter set by the call
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyMatDescr(cusparseMatDescr_t descrA); // descriptor is passed by value; presumably the counterpart of cusparseCreateMatDescr
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetMatType(cusparseMatDescr_t   descrA,
+                   cusparseMatrixType_t type); // cusparseMatrixType_t is declared above this excerpt
+
+cusparseMatrixType_t CUSPARSEAPI // returns the value directly, so this signature has no status-code channel
+cusparseGetMatType(const cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetMatFillMode(cusparseMatDescr_t descrA,
+                       cusparseFillMode_t fillMode); // one of CUSPARSE_FILL_MODE_LOWER / CUSPARSE_FILL_MODE_UPPER
+
+cusparseFillMode_t CUSPARSEAPI // returns the value directly, so this signature has no status-code channel
+cusparseGetMatFillMode(const cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetMatDiagType(cusparseMatDescr_t descrA,
+                       cusparseDiagType_t diagType); // one of CUSPARSE_DIAG_TYPE_NON_UNIT / CUSPARSE_DIAG_TYPE_UNIT
+
+cusparseDiagType_t CUSPARSEAPI // returns the value directly, so this signature has no status-code channel
+cusparseGetMatDiagType(const cusparseMatDescr_t descrA);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSetMatIndexBase(cusparseMatDescr_t  descrA,
+                        cusparseIndexBase_t base); // one of CUSPARSE_INDEX_BASE_ZERO / CUSPARSE_INDEX_BASE_ONE
+
+cusparseIndexBase_t CUSPARSEAPI // returns the value directly, so this signature has no status-code channel
+cusparseGetMatIndexBase(const cusparseMatDescr_t descrA);
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsric02Info(csric02Info_t* info); // info is passed by pointer: presumably an out-parameter set by the call
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyCsric02Info(csric02Info_t info); // info is passed by value
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateBsric02Info(bsric02Info_t* info); // info is passed by pointer: presumably an out-parameter set by the call
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyBsric02Info(bsric02Info_t info); // info is passed by value
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsrilu02Info(csrilu02Info_t* info); // info is passed by pointer: presumably an out-parameter set by the call
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyCsrilu02Info(csrilu02Info_t info); // info is passed by value
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateBsrilu02Info(bsrilu02Info_t* info); // info is passed by pointer: presumably an out-parameter set by the call
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyBsrilu02Info(bsrilu02Info_t info); // info is passed by value
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateBsrsv2Info(bsrsv2Info_t* info); // creates the bsrsv2Info_t that the cusparse?bsrsv2_* prototypes take as their info argument
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyBsrsv2Info(bsrsv2Info_t info); // info is passed by value
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateBsrsm2Info(bsrsm2Info_t* info); // creates the bsrsm2Info_t that cusparseXbsrsm2_zeroPivot takes as its info argument
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyBsrsm2Info(bsrsm2Info_t info); // info is passed by value
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateCsru2csrInfo(csru2csrInfo_t* info); // info is passed by pointer: presumably an out-parameter set by the call
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyCsru2csrInfo(csru2csrInfo_t info); // info is passed by value
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateColorInfo(cusparseColorInfo_t* info); // info is passed by pointer: presumably an out-parameter set by the call
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyColorInfo(cusparseColorInfo_t info); // info is passed by value
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI
+cusparseCreatePruneInfo(pruneInfo_t* info); // info is passed by pointer: presumably an out-parameter set by the call
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyPruneInfo(pruneInfo_t info); // info is passed by value
+
+//##############################################################################
+//# SPARSE LEVEL 2 ROUTINES
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI // float variant of gemvi: all numeric arguments are float
+cusparseSgemvi(cusparseHandle_t    handle,
+               cusparseOperation_t transA,
+               int                 m,
+               int                 n,
+               const float*        alpha, // scalar passed by pointer
+               const float*        A,
+               int                 lda, // by name, leading dimension of A — presumably A is a dense array, TODO confirm
+               int                 nnz,
+               const float*        xVal,
+               const int*          xInd, // xVal/xInd: value and index arrays — presumably a sparse x with nnz entries, TODO confirm
+               const float*        beta, // scalar passed by pointer
+               float*              y, // the only non-const typed data pointer in the signature
+               cusparseIndexBase_t idxBase,
+               void*               pBuffer); // untyped workspace; presumably sized via cusparseSgemvi_bufferSize
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgemvi_bufferSize(cusparseHandle_t    handle,
+                          cusparseOperation_t transA,
+                          int                 m,
+                          int                 n,
+                          int                 nnz,
+                          int*                pBufferSize); // result goes through an int*, so the reported size is limited to int range
+
+cusparseStatus_t CUSPARSEAPI // double variant of gemvi: all numeric arguments are double
+cusparseDgemvi(cusparseHandle_t    handle,
+               cusparseOperation_t transA,
+               int                 m,
+               int                 n,
+               const double*       alpha, // scalar passed by pointer
+               const double*       A,
+               int                 lda, // by name, leading dimension of A — presumably A is a dense array, TODO confirm
+               int                 nnz,
+               const double*       xVal,
+               const int*          xInd, // xVal/xInd: value and index arrays — presumably a sparse x with nnz entries, TODO confirm
+               const double*       beta, // scalar passed by pointer
+               double*             y, // the only non-const typed data pointer in the signature
+               cusparseIndexBase_t idxBase,
+               void*               pBuffer); // untyped workspace; presumably sized via cusparseDgemvi_bufferSize
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgemvi_bufferSize(cusparseHandle_t    handle,
+                          cusparseOperation_t transA,
+                          int                 m,
+                          int                 n,
+                          int                 nnz,
+                          int*                pBufferSize); // result goes through an int*, so the reported size is limited to int range
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant of gemvi: all numeric arguments are cuComplex
+cusparseCgemvi(cusparseHandle_t    handle,
+               cusparseOperation_t transA,
+               int                 m,
+               int                 n,
+               const cuComplex*    alpha, // scalar passed by pointer
+               const cuComplex*    A,
+               int                 lda, // by name, leading dimension of A — presumably A is a dense array, TODO confirm
+               int                 nnz,
+               const cuComplex*    xVal,
+               const int*          xInd, // xVal/xInd: value and index arrays — presumably a sparse x with nnz entries, TODO confirm
+               const cuComplex*    beta, // scalar passed by pointer
+               cuComplex*          y, // the only non-const typed data pointer in the signature
+               cusparseIndexBase_t idxBase,
+               void*               pBuffer); // untyped workspace; presumably sized via cusparseCgemvi_bufferSize
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgemvi_bufferSize(cusparseHandle_t    handle,
+                          cusparseOperation_t transA,
+                          int                 m,
+                          int                 n,
+                          int                 nnz,
+                          int*                pBufferSize); // result goes through an int*, so the reported size is limited to int range
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant of gemvi: all numeric arguments are cuDoubleComplex
+cusparseZgemvi(cusparseHandle_t       handle,
+               cusparseOperation_t    transA,
+               int                    m,
+               int                    n,
+               const cuDoubleComplex* alpha, // scalar passed by pointer
+               const cuDoubleComplex* A,
+               int                    lda, // by name, leading dimension of A — presumably A is a dense array, TODO confirm
+               int                    nnz,
+               const cuDoubleComplex* xVal,
+               const int*             xInd, // xVal/xInd: value and index arrays — presumably a sparse x with nnz entries, TODO confirm
+               const cuDoubleComplex* beta, // scalar passed by pointer
+               cuDoubleComplex*       y, // the only non-const typed data pointer in the signature
+               cusparseIndexBase_t    idxBase,
+               void*                  pBuffer); // untyped workspace; presumably sized via cusparseZgemvi_bufferSize
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgemvi_bufferSize(cusparseHandle_t    handle,
+                          cusparseOperation_t transA,
+                          int                 m,
+                          int                 n,
+                          int                 nnz,
+                          int*                pBufferSize); // result goes through an int*, so the reported size is limited to int range
+
+cusparseStatus_t CUSPARSEAPI // float variant of bsrmv; the signature has no workspace or info argument
+cusparseSbsrmv(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               int                      mb,
+               int                      nb,
+               int                      nnzb, // mb/nb/nnzb: by name, counts expressed in blocks — TODO confirm
+               const float*             alpha, // scalar passed by pointer
+               const cusparseMatDescr_t descrA,
+               const float*             bsrSortedValA,
+               const int*               bsrSortedRowPtrA,
+               const int*               bsrSortedColIndA, // three read-only arrays: by name, BSR values / row pointers / column indices
+               int                      blockDim,
+               const float*             x,
+               const float*             beta, // scalar passed by pointer
+               float*                   y); // the only non-const pointer in the signature
+
+cusparseStatus_t CUSPARSEAPI // double variant of bsrmv; the signature has no workspace or info argument
+cusparseDbsrmv(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               int                      mb,
+               int                      nb,
+               int                      nnzb, // mb/nb/nnzb: by name, counts expressed in blocks — TODO confirm
+               const double*            alpha, // scalar passed by pointer
+               const cusparseMatDescr_t descrA,
+               const double*            bsrSortedValA,
+               const int*               bsrSortedRowPtrA,
+               const int*               bsrSortedColIndA, // three read-only arrays: by name, BSR values / row pointers / column indices
+               int                      blockDim,
+               const double*            x,
+               const double*            beta, // scalar passed by pointer
+               double*                  y); // the only non-const pointer in the signature
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant of bsrmv; the signature has no workspace or info argument
+cusparseCbsrmv(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               int                      mb,
+               int                      nb,
+               int                      nnzb, // mb/nb/nnzb: by name, counts expressed in blocks — TODO confirm
+               const cuComplex*         alpha, // scalar passed by pointer
+               const cusparseMatDescr_t descrA,
+               const cuComplex*         bsrSortedValA,
+               const int*               bsrSortedRowPtrA,
+               const int*               bsrSortedColIndA, // three read-only arrays: by name, BSR values / row pointers / column indices
+               int                      blockDim,
+               const cuComplex*         x,
+               const cuComplex*         beta, // scalar passed by pointer
+               cuComplex*               y); // the only non-const pointer in the signature
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant of bsrmv; the signature has no workspace or info argument
+cusparseZbsrmv(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               int                      mb,
+               int                      nb,
+               int                      nnzb, // mb/nb/nnzb: by name, counts expressed in blocks — TODO confirm
+               const cuDoubleComplex*   alpha, // scalar passed by pointer
+               const cusparseMatDescr_t descrA,
+               const cuDoubleComplex*   bsrSortedValA,
+               const int*               bsrSortedRowPtrA,
+               const int*               bsrSortedColIndA, // three read-only arrays: by name, BSR values / row pointers / column indices
+               int                      blockDim,
+               const cuDoubleComplex*   x,
+               const cuDoubleComplex*   beta, // scalar passed by pointer
+               cuDoubleComplex*         y); // the only non-const pointer in the signature
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // float variant of bsrxmv
+cusparseSbsrxmv(cusparseHandle_t         handle,
+                cusparseDirection_t      dirA,
+                cusparseOperation_t      transA,
+                int                      sizeOfMask, // not present in cusparseSbsrmv; by name, the length of bsrSortedMaskPtrA — TODO confirm
+                int                      mb,
+                int                      nb,
+                int                      nnzb,
+                const float*             alpha, // scalar passed by pointer
+                const cusparseMatDescr_t descrA,
+                const float*             bsrSortedValA,
+                const int*               bsrSortedMaskPtrA, // extra index array compared with cusparseSbsrmv
+                const int*               bsrSortedRowPtrA,
+                const int*               bsrSortedEndPtrA, // extra index array compared with cusparseSbsrmv
+                const int*               bsrSortedColIndA,
+                int                      blockDim,
+                const float*             x,
+                const float*             beta, // scalar passed by pointer
+                float*                   y); // the only non-const pointer in the signature
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // double variant of bsrxmv
+cusparseDbsrxmv(cusparseHandle_t         handle,
+                cusparseDirection_t      dirA,
+                cusparseOperation_t      transA,
+                int                      sizeOfMask, // not present in cusparseDbsrmv; by name, the length of bsrSortedMaskPtrA — TODO confirm
+                int                      mb,
+                int                      nb,
+                int                      nnzb,
+                const double*            alpha, // scalar passed by pointer
+                const cusparseMatDescr_t descrA,
+                const double*            bsrSortedValA,
+                const int*               bsrSortedMaskPtrA, // extra index array compared with cusparseDbsrmv
+                const int*               bsrSortedRowPtrA,
+                const int*               bsrSortedEndPtrA, // extra index array compared with cusparseDbsrmv
+                const int*               bsrSortedColIndA,
+                int                      blockDim,
+                const double*            x,
+                const double*            beta, // scalar passed by pointer
+                double*                  y); // the only non-const pointer in the signature
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // cuComplex variant of bsrxmv
+cusparseCbsrxmv(cusparseHandle_t         handle,
+                cusparseDirection_t      dirA,
+                cusparseOperation_t      transA,
+                int                      sizeOfMask, // not present in cusparseCbsrmv; by name, the length of bsrSortedMaskPtrA — TODO confirm
+                int                      mb,
+                int                      nb,
+                int                      nnzb,
+                const cuComplex*         alpha, // scalar passed by pointer
+                const cusparseMatDescr_t descrA,
+                const cuComplex*         bsrSortedValA,
+                const int*               bsrSortedMaskPtrA, // extra index array compared with cusparseCbsrmv
+                const int*               bsrSortedRowPtrA,
+                const int*               bsrSortedEndPtrA, // extra index array compared with cusparseCbsrmv
+                const int*               bsrSortedColIndA,
+                int                      blockDim,
+                const cuComplex*         x,
+                const cuComplex*         beta, // scalar passed by pointer
+                cuComplex*               y); // the only non-const pointer in the signature
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant of bsrxmv
+cusparseZbsrxmv(cusparseHandle_t         handle,
+                cusparseDirection_t      dirA,
+                cusparseOperation_t      transA,
+                int                      sizeOfMask, // not present in cusparseZbsrmv; by name, the length of bsrSortedMaskPtrA — TODO confirm
+                int                      mb,
+                int                      nb,
+                int                      nnzb,
+                const cuDoubleComplex*   alpha, // scalar passed by pointer
+                const cusparseMatDescr_t descrA,
+                const cuDoubleComplex*   bsrSortedValA,
+                const int*               bsrSortedMaskPtrA, // extra index array compared with cusparseZbsrmv
+                const int*               bsrSortedRowPtrA,
+                const int*               bsrSortedEndPtrA, // extra index array compared with cusparseZbsrmv
+                const int*               bsrSortedColIndA,
+                int                      blockDim,
+                const cuDoubleComplex*   x,
+                const cuDoubleComplex*   beta, // scalar passed by pointer
+                cuDoubleComplex*         y); // the only non-const pointer in the signature
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // no numeric-typed argument, so a single prototype instead of S/D/C/Z variants
+cusparseXbsrsv2_zeroPivot(cusparseHandle_t handle,
+                          bsrsv2Info_t     info,
+                          int*             position); // non-const int*: presumably receives the pivot position — TODO confirm semantics in cuSPARSE docs
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // float variant
+cusparseSbsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           float*                   bsrSortedValA, // NOTE(review): non-const here, while cusparseSbsrsv2_analysis/_solve take const float*
+                           const int*               bsrSortedRowPtrA,
+                           const int*               bsrSortedColIndA,
+                           int                      blockDim,
+                           bsrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes); // int* result (by name, a byte count); the Ext variant uses size_t* instead
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // double variant
+cusparseDbsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           double*                  bsrSortedValA, // NOTE(review): non-const here, while cusparseDbsrsv2_analysis/_solve take const double*
+                           const int*               bsrSortedRowPtrA,
+                           const int*               bsrSortedColIndA,
+                           int                      blockDim,
+                           bsrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes); // int* result (by name, a byte count); the Ext variant uses size_t* instead
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // cuComplex variant
+cusparseCbsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuComplex*               bsrSortedValA, // NOTE(review): non-const here, while cusparseCbsrsv2_analysis/_solve take const cuComplex*
+                           const int*               bsrSortedRowPtrA,
+                           const int*               bsrSortedColIndA,
+                           int                      blockDim,
+                           bsrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes); // int* result (by name, a byte count); the Ext variant uses size_t* instead
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant
+cusparseZbsrsv2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuDoubleComplex*         bsrSortedValA, // NOTE(review): non-const here, while cusparseZbsrsv2_analysis/_solve take const cuDoubleComplex*
+                           const int*               bsrSortedRowPtrA,
+                           const int*               bsrSortedColIndA,
+                           int                      blockDim,
+                           bsrsv2Info_t             info,
+                           int*                     pBufferSizeInBytes); // int* result (by name, a byte count); the Ext variant uses size_t* instead
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // float variant; same argument list as cusparseSbsrsv2_bufferSize except for the last parameter's type
+cusparseSbsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              int                      mb,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              float*                   bsrSortedValA, // NOTE(review): non-const, unlike the const float* taken by _analysis/_solve
+                              const int*               bsrSortedRowPtrA,
+                              const int*               bsrSortedColIndA,
+                              int                      blockSize, // named blockDim in cusparseSbsrsv2_bufferSize; same position and type
+                              bsrsv2Info_t             info,
+                              size_t*                  pBufferSize); // size_t* result instead of the int* used by the non-Ext variant
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // double variant; same argument list as cusparseDbsrsv2_bufferSize except for the last parameter's type
+cusparseDbsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              int                      mb,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              double*                  bsrSortedValA, // NOTE(review): non-const, unlike the const double* taken by _analysis/_solve
+                              const int*               bsrSortedRowPtrA,
+                              const int*               bsrSortedColIndA,
+                              int                      blockSize, // named blockDim in cusparseDbsrsv2_bufferSize; same position and type
+                              bsrsv2Info_t             info,
+                              size_t*                  pBufferSize); // size_t* result instead of the int* used by the non-Ext variant
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; same argument list as cusparseCbsrsv2_bufferSize except for the last parameter's type
+cusparseCbsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              int                      mb,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              cuComplex*               bsrSortedValA, // NOTE(review): non-const, unlike the const cuComplex* taken by _analysis/_solve
+                              const int*               bsrSortedRowPtrA,
+                              const int*               bsrSortedColIndA,
+                              int                      blockSize, // named blockDim in cusparseCbsrsv2_bufferSize; same position and type
+                              bsrsv2Info_t             info,
+                              size_t*                  pBufferSize); // size_t* result instead of the int* used by the non-Ext variant
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; same argument list as cusparseZbsrsv2_bufferSize except for the last parameter's type
+cusparseZbsrsv2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              int                      mb,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              cuDoubleComplex*         bsrSortedValA, // NOTE(review): non-const, unlike the const cuDoubleComplex* taken by _analysis/_solve
+                              const int*               bsrSortedRowPtrA,
+                              const int*               bsrSortedColIndA,
+                              int                      blockSize, // named blockDim in cusparseZbsrsv2_bufferSize; same position and type
+                              bsrsv2Info_t             info,
+                              size_t*                  pBufferSize); // size_t* result instead of the int* used by the non-Ext variant
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // float variant; has no alpha, right-hand-side or solution argument
+cusparseSbsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         int                      mb,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const float*             bsrSortedValA, // matrix arrays are read-only in this signature
+                         const int*               bsrSortedRowPtrA,
+                         const int*               bsrSortedColIndA,
+                         int                      blockDim,
+                         bsrsv2Info_t             info, // opaque info object passed by value; presumably updated by the analysis — TODO confirm
+                         cusparseSolvePolicy_t    policy, // CUSPARSE_SOLVE_POLICY_NO_LEVEL or CUSPARSE_SOLVE_POLICY_USE_LEVEL
+                         void*                    pBuffer); // untyped workspace; presumably sized by the bsrsv2 _bufferSize/_bufferSizeExt calls
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // double variant; has no alpha, right-hand-side or solution argument
+cusparseDbsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         int                      mb,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const double*            bsrSortedValA, // matrix arrays are read-only in this signature
+                         const int*               bsrSortedRowPtrA,
+                         const int*               bsrSortedColIndA,
+                         int                      blockDim,
+                         bsrsv2Info_t             info, // opaque info object passed by value; presumably updated by the analysis — TODO confirm
+                         cusparseSolvePolicy_t    policy, // CUSPARSE_SOLVE_POLICY_NO_LEVEL or CUSPARSE_SOLVE_POLICY_USE_LEVEL
+                         void*                    pBuffer); // untyped workspace; presumably sized by the bsrsv2 _bufferSize/_bufferSizeExt calls
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; has no alpha, right-hand-side or solution argument
+cusparseCbsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         int                      mb,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const cuComplex*         bsrSortedValA, // matrix arrays are read-only in this signature
+                         const int*               bsrSortedRowPtrA,
+                         const int*               bsrSortedColIndA,
+                         int                      blockDim,
+                         bsrsv2Info_t             info, // opaque info object passed by value; presumably updated by the analysis — TODO confirm
+                         cusparseSolvePolicy_t    policy, // CUSPARSE_SOLVE_POLICY_NO_LEVEL or CUSPARSE_SOLVE_POLICY_USE_LEVEL
+                         void*                    pBuffer); // untyped workspace; presumably sized by the bsrsv2 _bufferSize/_bufferSizeExt calls
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; has no alpha, right-hand-side or solution argument
+cusparseZbsrsv2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         int                      mb,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const cuDoubleComplex*   bsrSortedValA, // matrix arrays are read-only in this signature
+                         const int*               bsrSortedRowPtrA,
+                         const int*               bsrSortedColIndA,
+                         int                      blockDim,
+                         bsrsv2Info_t             info, // opaque info object passed by value; presumably updated by the analysis — TODO confirm
+                         cusparseSolvePolicy_t    policy, // CUSPARSE_SOLVE_POLICY_NO_LEVEL or CUSPARSE_SOLVE_POLICY_USE_LEVEL
+                         void*                    pBuffer); // untyped workspace; presumably sized by the bsrsv2 _bufferSize/_bufferSizeExt calls
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // float variant; the _analysis argument list plus alpha, f and x
+cusparseSbsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      int                      mb,
+                      int                      nnzb,
+                      const float*             alpha, // scalar passed by pointer
+                      const cusparseMatDescr_t descrA,
+                      const float*             bsrSortedValA,
+                      const int*               bsrSortedRowPtrA,
+                      const int*               bsrSortedColIndA,
+                      int                      blockDim,
+                      bsrsv2Info_t             info, // presumably the same info object given to cusparseSbsrsv2_analysis — TODO confirm
+                      const float*             f, // read-only vector; presumably the right-hand side
+                      float*                   x, // the only non-const typed data pointer; presumably the solution
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer); // untyped workspace
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // double variant; the _analysis argument list plus alpha, f and x
+cusparseDbsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      int                      mb,
+                      int                      nnzb,
+                      const double*            alpha, // scalar passed by pointer
+                      const cusparseMatDescr_t descrA,
+                      const double*            bsrSortedValA,
+                      const int*               bsrSortedRowPtrA,
+                      const int*               bsrSortedColIndA,
+                      int                      blockDim,
+                      bsrsv2Info_t             info, // presumably the same info object given to cusparseDbsrsv2_analysis — TODO confirm
+                      const double*            f, // read-only vector; presumably the right-hand side
+                      double*                  x, // the only non-const typed data pointer; presumably the solution
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer); // untyped workspace
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; the _analysis argument list plus alpha, f and x
+cusparseCbsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      int                      mb,
+                      int                      nnzb,
+                      const cuComplex*         alpha, // scalar passed by pointer
+                      const cusparseMatDescr_t descrA,
+                      const cuComplex*         bsrSortedValA,
+                      const int*               bsrSortedRowPtrA,
+                      const int*               bsrSortedColIndA,
+                      int                      blockDim,
+                      bsrsv2Info_t             info, // presumably the same info object given to cusparseCbsrsv2_analysis — TODO confirm
+                      const cuComplex*         f, // read-only vector; presumably the right-hand side
+                      cuComplex*               x, // the only non-const typed data pointer; presumably the solution
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer); // untyped workspace
+
+CUSPARSE_DEPRECATED // macro defined outside this excerpt; presumably marks the declaration as deprecated
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; the _analysis argument list plus alpha, f and x
+cusparseZbsrsv2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      int                      mb,
+                      int                      nnzb,
+                      const cuDoubleComplex*   alpha, // scalar passed by pointer
+                      const cusparseMatDescr_t descrA,
+                      const cuDoubleComplex*   bsrSortedValA,
+                      const int*               bsrSortedRowPtrA,
+                      const int*               bsrSortedColIndA,
+                      int                      blockDim,
+                      bsrsv2Info_t             info, // presumably the same info object given to cusparseZbsrsv2_analysis — TODO confirm
+                      const cuDoubleComplex*   f, // read-only vector; presumably the right-hand side
+                      cuDoubleComplex*         x, // the only non-const typed data pointer; presumably the solution
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer); // untyped workspace
+
+//##############################################################################
+//# SPARSE LEVEL 3 ROUTINES
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI // float variant of bsrmm; not marked CUSPARSE_DEPRECATED and takes no workspace argument
+cusparseSbsrmm(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               cusparseOperation_t      transB, // separate operation selectors for A and B
+               int                      mb,
+               int                      n,
+               int                      kb,
+               int                      nnzb,
+               const float*             alpha, // scalar passed by pointer
+               const cusparseMatDescr_t descrA,
+               const float* bsrSortedValA,
+               const int*   bsrSortedRowPtrA,
+               const int*   bsrSortedColIndA,
+               const int    blockSize, // top-level const on a by-value parameter: no effect on the function's type for callers
+               const float* B,
+               const int    ldb, // top-level const on a by-value parameter, unlike the plain int ldc below
+               const float* beta, // scalar passed by pointer
+               float*       C, // the only non-const pointer in the signature
+               int          ldc);
+
+cusparseStatus_t CUSPARSEAPI // double variant of bsrmm; not marked CUSPARSE_DEPRECATED and takes no workspace argument
+cusparseDbsrmm(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               cusparseOperation_t      transB, // separate operation selectors for A and B
+               int                      mb,
+               int                      n,
+               int                      kb,
+               int                      nnzb,
+               const double*            alpha, // scalar passed by pointer
+               const cusparseMatDescr_t descrA,
+               const double* bsrSortedValA,
+               const int*    bsrSortedRowPtrA,
+               const int*    bsrSortedColIndA,
+               const int     blockSize, // top-level const on a by-value parameter: no effect on the function's type for callers
+               const double* B,
+               const int     ldb, // top-level const on a by-value parameter, unlike the plain int ldc below
+               const double* beta, // scalar passed by pointer
+               double*       C, // the only non-const pointer in the signature
+               int           ldc);
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant of bsrmm; not marked CUSPARSE_DEPRECATED and takes no workspace argument
+cusparseCbsrmm(cusparseHandle_t         handle,
+               cusparseDirection_t      dirA,
+               cusparseOperation_t      transA,
+               cusparseOperation_t      transB, // separate operation selectors for A and B
+               int                      mb,
+               int                      n,
+               int                      kb,
+               int                      nnzb,
+               const cuComplex*         alpha, // scalar passed by pointer
+               const cusparseMatDescr_t descrA,
+               const cuComplex* bsrSortedValA,
+               const int*       bsrSortedRowPtrA,
+               const int*       bsrSortedColIndA,
+               const int        blockSize, // top-level const on a by-value parameter: no effect on the function's type for callers
+               const cuComplex* B,
+               const int        ldb, // top-level const on a by-value parameter, unlike the plain int ldc below
+               const cuComplex* beta, // scalar passed by pointer
+               cuComplex*       C, // the only non-const pointer in the signature
+               int              ldc);
+
+cusparseStatus_t CUSPARSEAPI  // bsrmm prototype for cuDoubleComplex values; same parameter list shape as cusparseDbsrmm / cusparseCbsrmm
+cusparseZbsrmm(cusparseHandle_t         handle,
+                cusparseDirection_t      dirA,  // continuation lines sit one column deeper than in the D/C variants; whitespace only
+                cusparseOperation_t      transA,
+                cusparseOperation_t      transB,
+                int                      mb,
+                int                      n,
+                int                      kb,
+                int                      nnzb,
+                const cuDoubleComplex*   alpha,  // scalar supplied through a read-only pointer
+                const cusparseMatDescr_t descrA,
+                const cuDoubleComplex*   bsrSortedValA,
+                const int*               bsrSortedRowPtrA,
+                const int*               bsrSortedColIndA,
+                const int                blockSize,
+                const cuDoubleComplex*   B,
+                const int                ldb,
+                const cuDoubleComplex*   beta,
+                cuDoubleComplex*         C,  // only parameter declared as a non-const pointer; presumably the in/out result - confirm against cuSPARSE docs
+                int                      ldc);
+
+CUSPARSE_DEPRECATED  // declaration is tagged with the CUSPARSE_DEPRECATED macro
+cusparseStatus_t CUSPARSEAPI
+cusparseXbsrsm2_zeroPivot(cusparseHandle_t handle,
+                          bsrsm2Info_t     info,  // same info type that the bsrsm2 analysis/solve prototypes take
+                          int*             position);  // writable int*; presumably receives the pivot position - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_bufferSize prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsm2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           cusparseOperation_t      transXY,
+                           int                      mb,
+                           int                      n,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           float*                   bsrSortedVal,  // non-const here, whereas cusparseSbsrsm2_analysis takes const float*
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockSize,
+                           bsrsm2Info_t             info,
+                           int*                     pBufferSizeInBytes);  // int* result slot; the _bufferSizeExt sibling uses size_t* instead
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_bufferSize prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsm2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           cusparseOperation_t      transXY,
+                           int                      mb,
+                           int                      n,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           double*                  bsrSortedVal,  // non-const here, whereas cusparseDbsrsm2_analysis takes const double*
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockSize,
+                           bsrsm2Info_t             info,
+                           int*                     pBufferSizeInBytes);  // int* result slot; the _bufferSizeExt sibling uses size_t* instead
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_bufferSize prototype for cuComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsm2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           cusparseOperation_t      transXY,
+                           int                      mb,
+                           int                      n,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuComplex*               bsrSortedVal,  // non-const here, whereas cusparseCbsrsm2_analysis takes const cuComplex*
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockSize,
+                           bsrsm2Info_t             info,
+                           int*                     pBufferSizeInBytes);  // int* result slot; the _bufferSizeExt sibling uses size_t* instead
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_bufferSize prototype for cuDoubleComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsm2_bufferSize(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           cusparseOperation_t      transA,
+                           cusparseOperation_t      transXY,
+                           int                      mb,
+                           int                      n,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuDoubleComplex*         bsrSortedVal,  // non-const here, whereas cusparseZbsrsm2_analysis takes const cuDoubleComplex*
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockSize,
+                           bsrsm2Info_t             info,
+                           int*                     pBufferSizeInBytes);  // int* result slot; the _bufferSizeExt sibling uses size_t* instead
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_bufferSizeExt prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,  // named transB here; the non-Ext bufferSize, analysis and solve prototypes name this slot transXY
+                              int                      mb,
+                              int                      n,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              float*                   bsrSortedVal,
+                              const int*               bsrSortedRowPtr,
+                              const int*               bsrSortedColInd,
+                              int                      blockSize,
+                              bsrsm2Info_t             info,
+                              size_t*                  pBufferSize);  // size_t* here versus int* in cusparseSbsrsm2_bufferSize
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_bufferSizeExt prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,  // named transB here; the non-Ext bufferSize, analysis and solve prototypes name this slot transXY
+                              int                      mb,
+                              int                      n,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              double*                  bsrSortedVal,
+                              const int*               bsrSortedRowPtr,
+                              const int*               bsrSortedColInd,
+                              int                      blockSize,
+                              bsrsm2Info_t             info,
+                              size_t*                  pBufferSize);  // size_t* here versus int* in cusparseDbsrsm2_bufferSize
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_bufferSizeExt prototype for cuComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,  // named transB here; the non-Ext bufferSize, analysis and solve prototypes name this slot transXY
+                              int                      mb,
+                              int                      n,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              cuComplex*               bsrSortedVal,
+                              const int*               bsrSortedRowPtr,
+                              const int*               bsrSortedColInd,
+                              int                      blockSize,
+                              bsrsm2Info_t             info,
+                              size_t*                  pBufferSize);  // size_t* here versus int* in cusparseCbsrsm2_bufferSize
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_bufferSizeExt prototype for cuDoubleComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsm2_bufferSizeExt(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              cusparseOperation_t      transA,
+                              cusparseOperation_t      transB,  // named transB here; the non-Ext bufferSize, analysis and solve prototypes name this slot transXY
+                              int                      mb,
+                              int                      n,
+                              int                      nnzb,
+                              const cusparseMatDescr_t descrA,
+                              cuDoubleComplex*         bsrSortedVal,
+                              const int*               bsrSortedRowPtr,
+                              const int*               bsrSortedColInd,
+                              int                      blockSize,
+                              bsrsm2Info_t             info,
+                              size_t*                  pBufferSize);  // size_t* here versus int* in cusparseZbsrsm2_bufferSize
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_analysis prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsm2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transXY,
+                         int                      mb,
+                         int                      n,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const float*             bsrSortedVal,  // all three matrix arrays are const-qualified in this call
+                         const int*               bsrSortedRowPtr,
+                         const int*               bsrSortedColInd,
+                         int                      blockSize,
+                         bsrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);  // untyped caller-supplied buffer; presumably sized via a bsrsm2_bufferSize query - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_analysis prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsm2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transXY,
+                         int                      mb,
+                         int                      n,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const double*            bsrSortedVal,  // all three matrix arrays are const-qualified in this call
+                         const int*               bsrSortedRowPtr,
+                         const int*               bsrSortedColInd,
+                         int                      blockSize,
+                         bsrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);  // untyped caller-supplied buffer; presumably sized via a bsrsm2_bufferSize query - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_analysis prototype for cuComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsm2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transXY,
+                         int                      mb,
+                         int                      n,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const cuComplex*         bsrSortedVal,  // all three matrix arrays are const-qualified in this call
+                         const int*               bsrSortedRowPtr,
+                         const int*               bsrSortedColInd,
+                         int                      blockSize,
+                         bsrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);  // untyped caller-supplied buffer; presumably sized via a bsrsm2_bufferSize query - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_analysis prototype for cuDoubleComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsm2_analysis(cusparseHandle_t         handle,
+                         cusparseDirection_t      dirA,
+                         cusparseOperation_t      transA,
+                         cusparseOperation_t      transXY,
+                         int                      mb,
+                         int                      n,
+                         int                      nnzb,
+                         const cusparseMatDescr_t descrA,
+                         const cuDoubleComplex*   bsrSortedVal,  // all three matrix arrays are const-qualified in this call
+                         const int*               bsrSortedRowPtr,
+                         const int*               bsrSortedColInd,
+                         int                      blockSize,
+                         bsrsm2Info_t             info,
+                         cusparseSolvePolicy_t    policy,
+                         void*                    pBuffer);  // untyped caller-supplied buffer; presumably sized via a bsrsm2_bufferSize query - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_solve prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrsm2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transXY,
+                      int                      mb,
+                      int                      n,
+                      int                      nnzb,
+                      const float*             alpha,  // scalar supplied through a read-only pointer
+                      const cusparseMatDescr_t descrA,
+                      const float*             bsrSortedVal,
+                      const int*               bsrSortedRowPtr,
+                      const int*               bsrSortedColInd,
+                      int                      blockSize,
+                      bsrsm2Info_t             info,
+                      const float*             B,  // const-qualified, so read-only in this call
+                      int                      ldb,
+                      float*                   X,  // only writable value array; presumably receives the solution - confirm against cuSPARSE docs
+                      int                      ldx,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);  // untyped caller-supplied buffer
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_solve prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrsm2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transXY,
+                      int                      mb,
+                      int                      n,
+                      int                      nnzb,
+                      const double*            alpha,  // scalar supplied through a read-only pointer
+                      const cusparseMatDescr_t descrA,
+                      const double*            bsrSortedVal,
+                      const int*               bsrSortedRowPtr,
+                      const int*               bsrSortedColInd,
+                      int                      blockSize,
+                      bsrsm2Info_t             info,
+                      const double*            B,  // const-qualified, so read-only in this call
+                      int                      ldb,
+                      double*                  X,  // only writable value array; presumably receives the solution - confirm against cuSPARSE docs
+                      int                      ldx,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);  // untyped caller-supplied buffer
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_solve prototype for cuComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrsm2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transXY,
+                      int                      mb,
+                      int                      n,
+                      int                      nnzb,
+                      const cuComplex*         alpha,  // scalar supplied through a read-only pointer
+                      const cusparseMatDescr_t descrA,
+                      const cuComplex*         bsrSortedVal,
+                      const int*               bsrSortedRowPtr,
+                      const int*               bsrSortedColInd,
+                      int                      blockSize,
+                      bsrsm2Info_t             info,
+                      const cuComplex*         B,  // const-qualified, so read-only in this call
+                      int                      ldb,
+                      cuComplex*               X,  // only writable value array; presumably receives the solution - confirm against cuSPARSE docs
+                      int                      ldx,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);  // untyped caller-supplied buffer
+
+CUSPARSE_DEPRECATED  // deprecated bsrsm2_solve prototype for cuDoubleComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrsm2_solve(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      cusparseOperation_t      transA,
+                      cusparseOperation_t      transXY,
+                      int                      mb,
+                      int                      n,
+                      int                      nnzb,
+                      const cuDoubleComplex*   alpha,  // scalar supplied through a read-only pointer
+                      const cusparseMatDescr_t descrA,
+                      const cuDoubleComplex*   bsrSortedVal,
+                      const int*               bsrSortedRowPtr,
+                      const int*               bsrSortedColInd,
+                      int                      blockSize,
+                      bsrsm2Info_t             info,
+                      const cuDoubleComplex*   B,  // const-qualified, so read-only in this call
+                      int                      ldb,
+                      cuDoubleComplex*         X,  // only writable value array; presumably receives the solution - confirm against cuSPARSE docs
+                      int                      ldx,
+                      cusparseSolvePolicy_t    policy,
+                      void*                    pBuffer);  // untyped caller-supplied buffer
+
+//##############################################################################
+//# PRECONDITIONERS
+//##############################################################################
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_numericBoost prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrilu02_numericBoost(cusparseHandle_t handle,
+                               csrilu02Info_t   info,
+                               int              enable_boost,  // plain int; presumably an on/off flag - confirm against cuSPARSE docs
+                               double*          tol,  // double* in every S/D/C/Z variant, independent of the value type
+                               float*           boost_val);  // pointer type follows the variant's value type (float here)
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_numericBoost prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrilu02_numericBoost(cusparseHandle_t handle,
+                               csrilu02Info_t   info,
+                               int              enable_boost,  // plain int; presumably an on/off flag - confirm against cuSPARSE docs
+                               double*          tol,  // double* in every S/D/C/Z variant, independent of the value type
+                               double*          boost_val);  // pointer type follows the variant's value type (double here)
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_numericBoost prototype for cuComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrilu02_numericBoost(cusparseHandle_t handle,
+                               csrilu02Info_t   info,
+                               int              enable_boost,  // plain int; presumably an on/off flag - confirm against cuSPARSE docs
+                               double*          tol,  // stays double* even though the values are complex
+                               cuComplex*       boost_val);  // pointer type follows the variant's value type (cuComplex here)
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_numericBoost prototype for cuDoubleComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrilu02_numericBoost(cusparseHandle_t handle,
+                               csrilu02Info_t   info,
+                               int              enable_boost,  // plain int; presumably an on/off flag - confirm against cuSPARSE docs
+                               double*          tol,  // stays double* even though the values are complex
+                               cuDoubleComplex* boost_val);  // pointer type follows the variant's value type (cuDoubleComplex here)
+
+CUSPARSE_DEPRECATED  // declaration is tagged with the CUSPARSE_DEPRECATED macro
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsrilu02_zeroPivot(cusparseHandle_t handle,
+                            csrilu02Info_t   info,  // same info type that the csrilu02 analysis/factorization prototypes take
+                            int*             position);  // writable int*; presumably receives the pivot position - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_bufferSize prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrilu02_bufferSize(cusparseHandle_t         handle,
+                             int                      m,
+                             int                      nnz,
+                             const cusparseMatDescr_t descrA,
+                             float*                   csrSortedValA,  // non-const here, whereas cusparseScsrilu02_analysis takes const float*
+                             const int*               csrSortedRowPtrA,
+                             const int*               csrSortedColIndA,
+                             csrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);  // int* result slot; the _bufferSizeExt sibling uses size_t* instead
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_bufferSize prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrilu02_bufferSize(cusparseHandle_t         handle,
+                             int                      m,
+                             int                      nnz,
+                             const cusparseMatDescr_t descrA,
+                             double*                  csrSortedValA,  // non-const here, whereas cusparseDcsrilu02_analysis takes const double*
+                             const int*               csrSortedRowPtrA,
+                             const int*               csrSortedColIndA,
+                             csrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);  // int* result slot; the _bufferSizeExt sibling uses size_t* instead
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_bufferSize prototype for cuComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrilu02_bufferSize(cusparseHandle_t         handle,
+                             int                      m,
+                             int                      nnz,
+                             const cusparseMatDescr_t descrA,
+                             cuComplex*               csrSortedValA,  // non-const here, whereas cusparseCcsrilu02_analysis takes const cuComplex*
+                             const int*               csrSortedRowPtrA,
+                             const int*               csrSortedColIndA,
+                             csrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);  // int* result slot; the _bufferSizeExt sibling uses size_t* instead
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_bufferSize prototype for cuDoubleComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrilu02_bufferSize(cusparseHandle_t         handle,
+                             int                      m,
+                             int                      nnz,
+                             const cusparseMatDescr_t descrA,
+                             cuDoubleComplex*         csrSortedValA,  // non-const here, whereas cusparseZcsrilu02_analysis takes const cuDoubleComplex*
+                             const int*               csrSortedRowPtrA,
+                             const int*               csrSortedColIndA,
+                             csrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);  // int* result slot; the _bufferSizeExt sibling uses size_t* instead
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_bufferSizeExt prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      nnz,
+                                const cusparseMatDescr_t descrA,
+                                float*                   csrSortedVal,  // array names drop the trailing 'A' used by the non-Ext prototype; naming only
+                                const int*               csrSortedRowPtr,
+                                const int*               csrSortedColInd,
+                                csrilu02Info_t           info,
+                                size_t*                  pBufferSize);  // size_t* here versus int* in cusparseScsrilu02_bufferSize
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_bufferSizeExt prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      nnz,
+                                const cusparseMatDescr_t descrA,
+                                double*                  csrSortedVal,  // array names drop the trailing 'A' used by the non-Ext prototype; naming only
+                                const int*               csrSortedRowPtr,
+                                const int*               csrSortedColInd,
+                                csrilu02Info_t           info,
+                                size_t*                  pBufferSize);  // size_t* here versus int* in cusparseDcsrilu02_bufferSize
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_bufferSizeExt prototype for cuComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      nnz,
+                                const cusparseMatDescr_t descrA,
+                                cuComplex*               csrSortedVal,  // array names drop the trailing 'A' used by the non-Ext prototype; naming only
+                                const int*               csrSortedRowPtr,
+                                const int*               csrSortedColInd,
+                                csrilu02Info_t           info,
+                                size_t*                  pBufferSize);  // size_t* here versus int* in cusparseCcsrilu02_bufferSize
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_bufferSizeExt prototype for cuDoubleComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      nnz,
+                                const cusparseMatDescr_t descrA,
+                                cuDoubleComplex*         csrSortedVal,  // array names drop the trailing 'A' used by the non-Ext prototype; naming only
+                                const int*               csrSortedRowPtr,
+                                const int*               csrSortedColInd,
+                                csrilu02Info_t           info,
+                                size_t*                  pBufferSize);  // size_t* here versus int* in cusparseZcsrilu02_bufferSize
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_analysis prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrilu02_analysis(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           const float*             csrSortedValA,  // all three matrix arrays are const-qualified in this call
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);  // untyped caller-supplied buffer; presumably sized via a csrilu02_bufferSize query - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_analysis prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrilu02_analysis(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           const double*            csrSortedValA,  // all three matrix arrays are const-qualified in this call
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);  // untyped caller-supplied buffer; presumably sized via a csrilu02_bufferSize query - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_analysis prototype for cuComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrilu02_analysis(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           const cuComplex*         csrSortedValA,  // all three matrix arrays are const-qualified in this call
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);  // untyped caller-supplied buffer; presumably sized via a csrilu02_bufferSize query - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02_analysis prototype for cuDoubleComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrilu02_analysis(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      nnz,
+                           const cusparseMatDescr_t descrA,
+                           const cuDoubleComplex*   csrSortedValA,  // all three matrix arrays are const-qualified in this call
+                           const int*               csrSortedRowPtrA,
+                           const int*               csrSortedColIndA,
+                           csrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);  // untyped caller-supplied buffer; presumably sized via a csrilu02_bufferSize query - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02 prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrilu02(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  float*                   csrSortedValA_valM,  // writable values array; the _valM suffix suggests it is overwritten with the result - confirm against cuSPARSE docs
+                  const int*            csrSortedRowPtrA,  // column alignment shifts from this line on; whitespace only
+                  const int*            csrSortedColIndA,
+                  csrilu02Info_t        info,
+                  cusparseSolvePolicy_t policy,
+                  void*                 pBuffer);  // untyped caller-supplied buffer
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02 prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrilu02(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  double*                  csrSortedValA_valM,  // writable values array; the _valM suffix suggests it is overwritten with the result - confirm against cuSPARSE docs
+                  const int*            csrSortedRowPtrA,  // column alignment shifts from this line on; whitespace only
+                  const int*            csrSortedColIndA,
+                  csrilu02Info_t        info,
+                  cusparseSolvePolicy_t policy,
+                  void*                 pBuffer);  // untyped caller-supplied buffer
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02 prototype for cuComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrilu02(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  cuComplex*               csrSortedValA_valM,  // writable values array; the _valM suffix suggests it is overwritten with the result - confirm against cuSPARSE docs
+                  const int*            csrSortedRowPtrA,  // column alignment shifts from this line on; whitespace only
+                  const int*            csrSortedColIndA,
+                  csrilu02Info_t        info,
+                  cusparseSolvePolicy_t policy,
+                  void*                 pBuffer);  // untyped caller-supplied buffer
+
+CUSPARSE_DEPRECATED  // deprecated csrilu02 prototype for cuDoubleComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrilu02(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  cuDoubleComplex*         csrSortedValA_valM,  // writable values array; the _valM suffix suggests it is overwritten with the result - confirm against cuSPARSE docs
+                  const int*            csrSortedRowPtrA,  // column alignment shifts from this line on; whitespace only
+                  const int*            csrSortedColIndA,
+                  csrilu02Info_t        info,
+                  cusparseSolvePolicy_t policy,
+                  void*                 pBuffer);  // untyped caller-supplied buffer
+
+CUSPARSE_DEPRECATED  // deprecated bsrilu02_numericBoost prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrilu02_numericBoost(cusparseHandle_t handle,
+                               bsrilu02Info_t   info,  // bsrilu02Info_t here, where the csrilu02 counterpart takes csrilu02Info_t
+                               int              enable_boost,  // plain int; presumably an on/off flag - confirm against cuSPARSE docs
+                               double*          tol,  // double* in every S/D/C/Z variant, independent of the value type
+                               float*           boost_val);  // pointer type follows the variant's value type (float here)
+
+CUSPARSE_DEPRECATED  // deprecated bsrilu02_numericBoost prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrilu02_numericBoost(cusparseHandle_t handle,
+                               bsrilu02Info_t   info,  // bsrilu02Info_t here, where the csrilu02 counterpart takes csrilu02Info_t
+                               int              enable_boost,  // plain int; presumably an on/off flag - confirm against cuSPARSE docs
+                               double*          tol,  // double* in every S/D/C/Z variant, independent of the value type
+                               double*          boost_val);  // pointer type follows the variant's value type (double here)
+
+CUSPARSE_DEPRECATED  // deprecated bsrilu02_numericBoost prototype for cuComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrilu02_numericBoost(cusparseHandle_t handle,
+                               bsrilu02Info_t   info,  // bsrilu02Info_t here, where the csrilu02 counterpart takes csrilu02Info_t
+                               int              enable_boost,  // plain int; presumably an on/off flag - confirm against cuSPARSE docs
+                               double*          tol,  // stays double* even though the values are complex
+                               cuComplex*       boost_val);  // pointer type follows the variant's value type (cuComplex here)
+
+CUSPARSE_DEPRECATED  // deprecated bsrilu02_numericBoost prototype for cuDoubleComplex values
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrilu02_numericBoost(cusparseHandle_t handle,
+                               bsrilu02Info_t   info,  // bsrilu02Info_t here, where the csrilu02 counterpart takes csrilu02Info_t
+                               int              enable_boost,  // plain int; presumably an on/off flag - confirm against cuSPARSE docs
+                               double*          tol,  // stays double* even though the values are complex
+                               cuDoubleComplex* boost_val);  // pointer type follows the variant's value type (cuDoubleComplex here)
+
+CUSPARSE_DEPRECATED  // declaration is tagged with the CUSPARSE_DEPRECATED macro
+cusparseStatus_t CUSPARSEAPI
+cusparseXbsrilu02_zeroPivot(cusparseHandle_t handle,
+                            bsrilu02Info_t   info,  // same info type that the bsrilu02 bufferSize prototypes take
+                            int*             position);  // writable int*; presumably receives the pivot position - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated bsrilu02_bufferSize prototype for float values
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrilu02_bufferSize(cusparseHandle_t         handle,
+                             cusparseDirection_t      dirA,
+                             int                      mb,
+                             int                      nnzb,
+                             const cusparseMatDescr_t descrA,
+                             float*                   bsrSortedVal,  // values pointer is non-const; the index arrays below are const
+                             const int*               bsrSortedRowPtr,
+                             const int*               bsrSortedColInd,
+                             int                      blockDim,  // named blockDim here; the bsrsm2/bsrmm prototypes call the corresponding parameter blockSize
+                             bsrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);  // writable int*; presumably receives the required size in bytes - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED  // deprecated bsrilu02_bufferSize prototype for double values
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrilu02_bufferSize(cusparseHandle_t         handle,
+                             cusparseDirection_t      dirA,
+                             int                      mb,
+                             int                      nnzb,
+                             const cusparseMatDescr_t descrA,
+                             double*                  bsrSortedVal,  // values pointer is non-const; the index arrays below are const
+                             const int*               bsrSortedRowPtr,
+                             const int*               bsrSortedColInd,
+                             int                      blockDim,  // named blockDim here; the bsrsm2/bsrmm prototypes call the corresponding parameter blockSize
+                             bsrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);  // writable int*; presumably receives the required size in bytes - confirm against cuSPARSE docs
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrilu02_bufferSize(cusparseHandle_t         handle,
+                             cusparseDirection_t      dirA,
+                             int                      mb,
+                             int                      nnzb,
+                             const cusparseMatDescr_t descrA,
+                             cuComplex*               bsrSortedVal,
+                             const int*               bsrSortedRowPtr,
+                             const int*               bsrSortedColInd,
+                             int                      blockDim,
+                             bsrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrilu02_bufferSize(cusparseHandle_t         handle,
+                             cusparseDirection_t      dirA,
+                             int                      mb,
+                             int                      nnzb,
+                             const cusparseMatDescr_t descrA,
+                             cuDoubleComplex*         bsrSortedVal,
+                             const int*               bsrSortedRowPtr,
+                             const int*               bsrSortedColInd,
+                             int                      blockDim,
+                             bsrilu02Info_t           info,
+                             int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                float*                   bsrSortedVal,
+                                const int*               bsrSortedRowPtr,
+                                const int*               bsrSortedColInd,
+                                int                      blockSize,
+                                bsrilu02Info_t           info,
+                                size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                double*                  bsrSortedVal,
+                                const int*               bsrSortedRowPtr,
+                                const int*               bsrSortedColInd,
+                                int                      blockSize,
+                                bsrilu02Info_t           info,
+                                size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                cuComplex*               bsrSortedVal,
+                                const int*               bsrSortedRowPtr,
+                                const int*               bsrSortedColInd,
+                                int                      blockSize,
+                                bsrilu02Info_t           info,
+                                size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrilu02_bufferSizeExt(cusparseHandle_t         handle,
+                               cusparseDirection_t      dirA,
+                               int                      mb,
+                               int                      nnzb,
+                               const cusparseMatDescr_t descrA,
+                               cuDoubleComplex*         bsrSortedVal,
+                               const int*               bsrSortedRowPtr,
+                               const int*               bsrSortedColInd,
+                               int                      blockSize,
+                               bsrilu02Info_t           info,
+                               size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrilu02_analysis(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           float*                   bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockDim,
+                           bsrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrilu02_analysis(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           double*                  bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockDim,
+                           bsrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrilu02_analysis(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuComplex*               bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockDim,
+                           bsrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrilu02_analysis(cusparseHandle_t         handle,
+                           cusparseDirection_t      dirA,
+                           int                      mb,
+                           int                      nnzb,
+                           const cusparseMatDescr_t descrA,
+                           cuDoubleComplex*         bsrSortedVal,
+                           const int*               bsrSortedRowPtr,
+                           const int*               bsrSortedColInd,
+                           int                      blockDim,
+                           bsrilu02Info_t           info,
+                           cusparseSolvePolicy_t    policy,
+                           void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsrilu02(cusparseHandle_t         handle,
+                  cusparseDirection_t      dirA,
+                  int                      mb,
+                  int                      nnzb,
+                  const cusparseMatDescr_t descrA,
+                  float*                   bsrSortedVal,
+                  const int*               bsrSortedRowPtr,
+                  const int*               bsrSortedColInd,
+                  int                      blockDim,
+                  bsrilu02Info_t           info,
+                  cusparseSolvePolicy_t    policy,
+                  void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsrilu02(cusparseHandle_t         handle,
+                  cusparseDirection_t      dirA,
+                  int                      mb,
+                  int                      nnzb,
+                  const cusparseMatDescr_t descrA,
+                  double*                  bsrSortedVal,
+                  const int*               bsrSortedRowPtr,
+                  const int*               bsrSortedColInd,
+                  int                      blockDim,
+                  bsrilu02Info_t           info,
+                  cusparseSolvePolicy_t    policy,
+                  void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsrilu02(cusparseHandle_t         handle,
+                  cusparseDirection_t      dirA,
+                  int                      mb,
+                  int                      nnzb,
+                  const cusparseMatDescr_t descrA,
+                  cuComplex*               bsrSortedVal,
+                  const int*               bsrSortedRowPtr,
+                  const int*               bsrSortedColInd,
+                  int                      blockDim,
+                  bsrilu02Info_t           info,
+                  cusparseSolvePolicy_t    policy,
+                  void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsrilu02(cusparseHandle_t         handle,
+                  cusparseDirection_t      dirA,
+                  int                      mb,
+                  int                      nnzb,
+                  const cusparseMatDescr_t descrA,
+                  cuDoubleComplex*         bsrSortedVal,
+                  const int*               bsrSortedRowPtr,
+                  const int*               bsrSortedColInd,
+                  int                      blockDim,
+                  bsrilu02Info_t           info,
+                  cusparseSolvePolicy_t    policy,
+                  void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsric02_zeroPivot(cusparseHandle_t handle,
+                           csric02Info_t    info,
+                           int*             position);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseScsric02_bufferSize(cusparseHandle_t         handle,
+                            int                      m,
+                            int                      nnz,
+                            const cusparseMatDescr_t descrA,
+                            float*                   csrSortedValA,
+                            const int*               csrSortedRowPtrA,
+                            const int*               csrSortedColIndA,
+                            csric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsric02_bufferSize(cusparseHandle_t         handle,
+                            int                      m,
+                            int                      nnz,
+                            const cusparseMatDescr_t descrA,
+                            double*                  csrSortedValA,
+                            const int*               csrSortedRowPtrA,
+                            const int*               csrSortedColIndA,
+                            csric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsric02_bufferSize(cusparseHandle_t         handle,
+                            int                      m,
+                            int                      nnz,
+                            const cusparseMatDescr_t descrA,
+                            cuComplex*               csrSortedValA,
+                            const int*               csrSortedRowPtrA,
+                            const int*               csrSortedColIndA,
+                            csric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsric02_bufferSize(cusparseHandle_t         handle,
+                            int                      m,
+                            int                      nnz,
+                            const cusparseMatDescr_t descrA,
+                            cuDoubleComplex*         csrSortedValA,
+                            const int*               csrSortedRowPtrA,
+                            const int*               csrSortedColIndA,
+                            csric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseScsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               int                      m,
+                               int                      nnz,
+                               const cusparseMatDescr_t descrA,
+                               float*                   csrSortedVal,
+                               const int*               csrSortedRowPtr,
+                               const int*               csrSortedColInd,
+                               csric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               int                      m,
+                               int                      nnz,
+                               const cusparseMatDescr_t descrA,
+                               double*                  csrSortedVal,
+                               const int*               csrSortedRowPtr,
+                               const int*               csrSortedColInd,
+                               csric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               int                      m,
+                               int                      nnz,
+                               const cusparseMatDescr_t descrA,
+                               cuComplex*               csrSortedVal,
+                               const int*               csrSortedRowPtr,
+                               const int*               csrSortedColInd,
+                               csric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               int                      m,
+                               int                      nnz,
+                               const cusparseMatDescr_t descrA,
+                               cuDoubleComplex*         csrSortedVal,
+                               const int*               csrSortedRowPtr,
+                               const int*               csrSortedColInd,
+                               csric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseScsric02_analysis(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      nnz,
+                          const cusparseMatDescr_t descrA,
+                          const float*             csrSortedValA,
+                          const int*               csrSortedRowPtrA,
+                          const int*               csrSortedColIndA,
+                          csric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsric02_analysis(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      nnz,
+                          const cusparseMatDescr_t descrA,
+                          const double*            csrSortedValA,
+                          const int*               csrSortedRowPtrA,
+                          const int*               csrSortedColIndA,
+                          csric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsric02_analysis(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      nnz,
+                          const cusparseMatDescr_t descrA,
+                          const cuComplex*         csrSortedValA,
+                          const int*               csrSortedRowPtrA,
+                          const int*               csrSortedColIndA,
+                          csric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsric02_analysis(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      nnz,
+                          const cusparseMatDescr_t descrA,
+                          const cuDoubleComplex*   csrSortedValA,
+                          const int*               csrSortedRowPtrA,
+                          const int*               csrSortedColIndA,
+                          csric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseScsric02(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 float*                   csrSortedValA_valM,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 csric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsric02(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 double*                  csrSortedValA_valM,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 csric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsric02(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 cuComplex*               csrSortedValA_valM,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 csric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsric02(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 cuDoubleComplex*         csrSortedValA_valM,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 csric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseXbsric02_zeroPivot(cusparseHandle_t handle,
+                           bsric02Info_t    info,
+                           int*             position);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsric02_bufferSize(cusparseHandle_t         handle,
+                            cusparseDirection_t      dirA,
+                            int                      mb,
+                            int                      nnzb,
+                            const cusparseMatDescr_t descrA,
+                            float*                   bsrSortedVal,
+                            const int*               bsrSortedRowPtr,
+                            const int*               bsrSortedColInd,
+                            int                      blockDim,
+                            bsric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsric02_bufferSize(cusparseHandle_t         handle,
+                            cusparseDirection_t      dirA,
+                            int                      mb,
+                            int                      nnzb,
+                            const cusparseMatDescr_t descrA,
+                            double*                  bsrSortedVal,
+                            const int*               bsrSortedRowPtr,
+                            const int*               bsrSortedColInd,
+                            int                      blockDim,
+                            bsric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsric02_bufferSize(cusparseHandle_t         handle,
+                            cusparseDirection_t      dirA,
+                            int                      mb,
+                            int                      nnzb,
+                            const cusparseMatDescr_t descrA,
+                            cuComplex*               bsrSortedVal,
+                            const int*               bsrSortedRowPtr,
+                            const int*               bsrSortedColInd,
+                            int                      blockDim,
+                            bsric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsric02_bufferSize(cusparseHandle_t         handle,
+                            cusparseDirection_t      dirA,
+                            int                      mb,
+                            int                      nnzb,
+                            const cusparseMatDescr_t descrA,
+                            cuDoubleComplex*         bsrSortedVal,
+                            const int*               bsrSortedRowPtr,
+                            const int*               bsrSortedColInd,
+                            int                      blockDim,
+                            bsric02Info_t            info,
+                            int*                     pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               cusparseDirection_t      dirA,
+                               int                      mb,
+                               int                      nnzb,
+                               const cusparseMatDescr_t descrA,
+                               float*                   bsrSortedVal,
+                               const int*               bsrSortedRowPtr,
+                               const int*               bsrSortedColInd,
+                               int                      blockSize,
+                               bsric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               cusparseDirection_t      dirA,
+                               int                      mb,
+                               int                      nnzb,
+                               const cusparseMatDescr_t descrA,
+                               double*                  bsrSortedVal,
+                               const int*               bsrSortedRowPtr,
+                               const int*               bsrSortedColInd,
+                               int                      blockSize,
+                               bsric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               cusparseDirection_t      dirA,
+                               int                      mb,
+                               int                      nnzb,
+                               const cusparseMatDescr_t descrA,
+                               cuComplex*               bsrSortedVal,
+                               const int*               bsrSortedRowPtr,
+                               const int*               bsrSortedColInd,
+                               int                      blockSize,
+                               bsric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsric02_bufferSizeExt(cusparseHandle_t         handle,
+                               cusparseDirection_t      dirA,
+                               int                      mb,
+                               int                      nnzb,
+                               const cusparseMatDescr_t descrA,
+                               cuDoubleComplex*         bsrSortedVal,
+                               const int*               bsrSortedRowPtr,
+                               const int*               bsrSortedColInd,
+                               int                      blockSize,
+                               bsric02Info_t            info,
+                               size_t*                  pBufferSize);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsric02_analysis(cusparseHandle_t         handle,
+                          cusparseDirection_t      dirA,
+                          int                      mb,
+                          int                      nnzb,
+                          const cusparseMatDescr_t descrA,
+                          const float*             bsrSortedVal,
+                          const int*               bsrSortedRowPtr,
+                          const int*               bsrSortedColInd,
+                          int                      blockDim,
+                          bsric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pInputBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsric02_analysis(cusparseHandle_t         handle,
+                          cusparseDirection_t      dirA,
+                          int                      mb,
+                          int                      nnzb,
+                          const cusparseMatDescr_t descrA,
+                          const double*            bsrSortedVal,
+                          const int*               bsrSortedRowPtr,
+                          const int*               bsrSortedColInd,
+                          int                      blockDim,
+                          bsric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pInputBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsric02_analysis(cusparseHandle_t         handle,
+                          cusparseDirection_t      dirA,
+                          int                      mb,
+                          int                      nnzb,
+                          const cusparseMatDescr_t descrA,
+                          const cuComplex*         bsrSortedVal,
+                          const int*               bsrSortedRowPtr,
+                          const int*               bsrSortedColInd,
+                          int                      blockDim,
+                          bsric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pInputBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsric02_analysis(cusparseHandle_t         handle,
+                          cusparseDirection_t      dirA,
+                          int                      mb,
+                          int                      nnzb,
+                          const cusparseMatDescr_t descrA,
+                          const cuDoubleComplex*   bsrSortedVal,
+                          const int*               bsrSortedRowPtr,
+                          const int*               bsrSortedColInd,
+                          int                      blockDim,
+                          bsric02Info_t            info,
+                          cusparseSolvePolicy_t    policy,
+                          void*                    pInputBuffer);
+
+CUSPARSE_DEPRECATED // marked deprecated through the CUSPARSE_DEPRECATED macro
+cusparseStatus_t CUSPARSEAPI // float variant; D/C/Z siblings differ only in the type of bsrSortedVal
+cusparseSbsric02(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nnzb,
+                 const cusparseMatDescr_t descrA,
+                 float*                   bsrSortedVal, // only typed array passed non-const; presumably updated in place — TODO confirm
+                 const int*               bsrSortedRowPtr,
+                 const int*               bsrSortedColInd,
+                 int                      blockDim,
+                 bsric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer); // untyped caller-supplied buffer; sizing rules are not visible in this chunk
+
+CUSPARSE_DEPRECATED // marked deprecated through the CUSPARSE_DEPRECATED macro
+cusparseStatus_t CUSPARSEAPI // double variant; S/C/Z siblings differ only in the type of bsrSortedVal
+cusparseDbsric02(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nnzb,
+                 const cusparseMatDescr_t descrA,
+                 double*                  bsrSortedVal, // only typed array passed non-const; presumably updated in place — TODO confirm
+                 const int*               bsrSortedRowPtr,
+                 const int*               bsrSortedColInd,
+                 int                      blockDim,
+                 bsric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer); // untyped caller-supplied buffer; sizing rules are not visible in this chunk
+
+CUSPARSE_DEPRECATED // marked deprecated through the CUSPARSE_DEPRECATED macro
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; S/D/Z siblings differ only in the type of bsrSortedVal
+cusparseCbsric02(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nnzb,
+                 const cusparseMatDescr_t descrA,
+                 cuComplex*               bsrSortedVal, // only typed array passed non-const; presumably updated in place — TODO confirm
+                 const int*               bsrSortedRowPtr,
+                 const int* // NOTE(review): declarator is split over two lines here, unlike the S/D/Z siblings; the type is still const int*
+                      bsrSortedColInd,
+                 int                      blockDim,
+                 bsric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer); // untyped caller-supplied buffer; sizing rules are not visible in this chunk
+
+CUSPARSE_DEPRECATED // marked deprecated through the CUSPARSE_DEPRECATED macro
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; S/D/C siblings differ only in the type of bsrSortedVal
+cusparseZbsric02(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nnzb,
+                 const cusparseMatDescr_t descrA,
+                 cuDoubleComplex*         bsrSortedVal, // non-const here, const in cusparseZbsric02_analysis; presumably updated in place — TODO confirm
+                 const int*               bsrSortedRowPtr,
+                 const int*               bsrSortedColInd,
+                 int                      blockDim,
+                 bsric02Info_t            info,
+                 cusparseSolvePolicy_t    policy,
+                 void*                    pBuffer); // untyped caller-supplied buffer; sizing rules are not visible in this chunk
+
+cusparseStatus_t CUSPARSEAPI // float variant of the gtsv2 size query; every array argument is const
+cusparseSgtsv2_bufferSizeExt(cusparseHandle_t handle,
+                             int              m,
+                             int              n,
+                             const float*     dl, // dl/d/du: presumably the lower/main/upper diagonals — TODO confirm
+                             const float*     d,
+                             const float*     du,
+                             const float*     B, // const here, non-const in cusparseSgtsv2
+                             int              ldb,
+                             size_t*          bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant of the gtsv2 size query; every array argument is const
+cusparseDgtsv2_bufferSizeExt(cusparseHandle_t handle,
+                             int              m,
+                             int              n,
+                             const double*    dl, // dl/d/du: presumably the lower/main/upper diagonals — TODO confirm
+                             const double*    d,
+                             const double*    du,
+                             const double*    B, // const here, non-const in cusparseDgtsv2
+                             int              ldb,
+                             size_t*          bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant of the gtsv2 size query; every array argument is const
+cusparseCgtsv2_bufferSizeExt(cusparseHandle_t handle,
+                             int              m,
+                             int              n,
+                             const cuComplex* dl, // dl/d/du: presumably the lower/main/upper diagonals — TODO confirm
+                             const cuComplex* d,
+                             const cuComplex* du,
+                             const cuComplex* B, // const here, non-const in cusparseCgtsv2
+                             int              ldb,
+                             size_t*          bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant of the gtsv2 size query; every array argument is const
+cusparseZgtsv2_bufferSizeExt(cusparseHandle_t       handle,
+                             int                    m,
+                             int                    n,
+                             const cuDoubleComplex* dl, // dl/d/du: presumably the lower/main/upper diagonals — TODO confirm
+                             const cuDoubleComplex* d,
+                             const cuDoubleComplex* du,
+                             const cuDoubleComplex* B, // const here, non-const in cusparseZgtsv2
+                             int                    ldb,
+                             size_t*                bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // float variant; same leading parameters as cusparseSgtsv2_bufferSizeExt
+cusparseSgtsv2(cusparseHandle_t handle,
+               int              m,
+               int              n,
+               const float*     dl, // dl/d/du are read-only inputs
+               const float*     d,
+               const float*     du,
+               float*           B, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+               int              ldb,
+               void*            pBuffer); // untyped workspace; presumably sized by cusparseSgtsv2_bufferSizeExt — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant; same leading parameters as cusparseDgtsv2_bufferSizeExt
+cusparseDgtsv2(cusparseHandle_t handle,
+               int              m,
+               int              n,
+               const double*    dl, // dl/d/du are read-only inputs
+               const double*    d,
+               const double*    du,
+               double*          B, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+               int              ldb,
+               void*            pBuffer); // untyped workspace; presumably sized by cusparseDgtsv2_bufferSizeExt — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; same leading parameters as cusparseCgtsv2_bufferSizeExt
+cusparseCgtsv2(cusparseHandle_t handle,
+               int              m,
+               int              n,
+               const cuComplex* dl, // dl/d/du are read-only inputs
+               const cuComplex* d,
+               const cuComplex* du,
+               cuComplex*       B, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+               int              ldb,
+               void*            pBuffer); // untyped workspace; presumably sized by cusparseCgtsv2_bufferSizeExt — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; same leading parameters as cusparseZgtsv2_bufferSizeExt
+cusparseZgtsv2(cusparseHandle_t       handle,
+               int                    m,
+               int                    n,
+               const cuDoubleComplex* dl, // dl/d/du are read-only inputs
+               const cuDoubleComplex* d,
+               const cuDoubleComplex* du,
+               cuDoubleComplex*       B, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+               int                    ldb,
+               void*                  pBuffer); // untyped workspace; presumably sized by cusparseZgtsv2_bufferSizeExt — confirm
+
+cusparseStatus_t CUSPARSEAPI // float variant; parameter list matches cusparseSgtsv2_bufferSizeExt exactly
+cusparseSgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle,
+                                     int              m,
+                                     int              n,
+                                     const float*     dl,
+                                     const float*     d,
+                                     const float*     du,
+                                     const float*     B, // const here, non-const in cusparseSgtsv2_nopivot
+                                     int              ldb,
+                                     size_t*          bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant; parameter list matches cusparseDgtsv2_bufferSizeExt exactly
+cusparseDgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle,
+                                     int              m,
+                                     int              n,
+                                     const double*    dl,
+                                     const double*    d,
+                                     const double*    du,
+                                     const double*    B, // const here, non-const in cusparseDgtsv2_nopivot
+                                     int              ldb,
+                                     size_t*          bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; parameter list matches cusparseCgtsv2_bufferSizeExt exactly
+cusparseCgtsv2_nopivot_bufferSizeExt(cusparseHandle_t handle,
+                                     int              m,
+                                     int              n,
+                                     const cuComplex* dl,
+                                     const cuComplex* d,
+                                     const cuComplex* du,
+                                     const cuComplex* B, // const here, non-const in cusparseCgtsv2_nopivot
+                                     int              ldb,
+                                     size_t*          bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; parameter list matches cusparseZgtsv2_bufferSizeExt exactly
+cusparseZgtsv2_nopivot_bufferSizeExt(cusparseHandle_t       handle,
+                                     int                    m,
+                                     int                    n,
+                                     const cuDoubleComplex* dl,
+                                     const cuDoubleComplex* d,
+                                     const cuDoubleComplex* du,
+                                     const cuDoubleComplex* B, // const here, non-const in cusparseZgtsv2_nopivot
+                                     int                    ldb,
+                                     size_t*                bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // float variant; parameter list matches cusparseSgtsv2 exactly
+cusparseSgtsv2_nopivot(cusparseHandle_t handle,
+                       int              m,
+                       int              n,
+                       const float*     dl, // dl/d/du are read-only inputs
+                       const float*     d,
+                       const float*     du,
+                       float*           B, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+                       int              ldb,
+                       void*            pBuffer); // untyped workspace; presumably sized by the _nopivot_bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant; parameter list matches cusparseDgtsv2 exactly
+cusparseDgtsv2_nopivot(cusparseHandle_t handle,
+                       int              m,
+                       int              n,
+                       const double*    dl, // dl/d/du are read-only inputs
+                       const double*    d,
+                       const double*    du,
+                       double*          B, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+                       int              ldb,
+                       void*            pBuffer); // untyped workspace; presumably sized by the _nopivot_bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; parameter list matches cusparseCgtsv2 exactly
+cusparseCgtsv2_nopivot(cusparseHandle_t handle,
+                       int              m,
+                       int              n,
+                       const cuComplex* dl, // dl/d/du are read-only inputs
+                       const cuComplex* d,
+                       const cuComplex* du,
+                       cuComplex*       B, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+                       int              ldb,
+                       void*            pBuffer); // untyped workspace; presumably sized by the _nopivot_bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; parameter list matches cusparseZgtsv2 exactly
+cusparseZgtsv2_nopivot(cusparseHandle_t       handle,
+                       int                    m,
+                       int                    n,
+                       const cuDoubleComplex* dl, // dl/d/du are read-only inputs
+                       const cuDoubleComplex* d,
+                       const cuDoubleComplex* du,
+                       cuDoubleComplex*       B, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+                       int                    ldb,
+                       void*                  pBuffer); // untyped workspace; presumably sized by the _nopivot_bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // float variant; unlike gtsv2_bufferSizeExt there is no n/B/ldb, but x/batchCount/batchStride instead
+cusparseSgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                         int              m,
+                                         const float*     dl,
+                                         const float*     d,
+                                         const float*     du,
+                                         const float*     x, // const here, non-const in cusparseSgtsv2StridedBatch
+                                         int              batchCount,
+                                         int              batchStride, // unit (elements vs. bytes) is not stated in this header — confirm
+                                         size_t*          bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant; unlike gtsv2_bufferSizeExt there is no n/B/ldb, but x/batchCount/batchStride instead
+cusparseDgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                         int              m,
+                                         const double*    dl,
+                                         const double*    d,
+                                         const double*    du,
+                                         const double*    x, // const here, non-const in cusparseDgtsv2StridedBatch
+                                         int              batchCount,
+                                         int              batchStride, // unit (elements vs. bytes) is not stated in this header — confirm
+                                         size_t*          bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; unlike gtsv2_bufferSizeExt there is no n/B/ldb, but x/batchCount/batchStride instead
+cusparseCgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                         int              m,
+                                         const cuComplex* dl,
+                                         const cuComplex* d,
+                                         const cuComplex* du,
+                                         const cuComplex* x, // const here, non-const in cusparseCgtsv2StridedBatch
+                                         int              batchCount,
+                                         int              batchStride, // unit (elements vs. bytes) is not stated in this header — confirm
+                                         size_t*          bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; unlike gtsv2_bufferSizeExt there is no n/B/ldb, but x/batchCount/batchStride instead
+cusparseZgtsv2StridedBatch_bufferSizeExt(cusparseHandle_t       handle,
+                                         int                    m,
+                                         const cuDoubleComplex* dl,
+                                         const cuDoubleComplex* d,
+                                         const cuDoubleComplex* du,
+                                         const cuDoubleComplex* x, // const here, non-const in cusparseZgtsv2StridedBatch
+                                         int                    batchCount,
+                                         int                    batchStride, // unit (elements vs. bytes) is not stated in this header — confirm
+                                         size_t* bufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // float variant; same leading parameters as cusparseSgtsv2StridedBatch_bufferSizeExt
+cusparseSgtsv2StridedBatch(cusparseHandle_t handle,
+                           int              m,
+                           const float*     dl, // dl/d/du are read-only inputs
+                           const float*     d,
+                           const float*     du,
+                           float*           x, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+                           int              batchCount,
+                           int              batchStride,
+                           void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant; same leading parameters as cusparseDgtsv2StridedBatch_bufferSizeExt
+cusparseDgtsv2StridedBatch(cusparseHandle_t handle,
+                           int              m,
+                           const double*    dl, // dl/d/du are read-only inputs
+                           const double*    d,
+                           const double*    du,
+                           double*          x, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+                           int              batchCount,
+                           int              batchStride,
+                           void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; same leading parameters as cusparseCgtsv2StridedBatch_bufferSizeExt
+cusparseCgtsv2StridedBatch(cusparseHandle_t handle,
+                           int              m,
+                           const cuComplex* dl, // dl/d/du are read-only inputs
+                           const cuComplex* d,
+                           const cuComplex* du,
+                           cuComplex*       x, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+                           int              batchCount,
+                           int              batchStride,
+                           void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; same leading parameters as cusparseZgtsv2StridedBatch_bufferSizeExt
+cusparseZgtsv2StridedBatch(cusparseHandle_t       handle,
+                           int                    m,
+                           const cuDoubleComplex* dl, // dl/d/du are read-only inputs
+                           const cuDoubleComplex* d,
+                           const cuDoubleComplex* du,
+                           cuDoubleComplex*       x, // only typed array passed non-const; presumably overwritten with the result — TODO confirm
+                           int                    batchCount,
+                           int                    batchStride,
+                           void*                  pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // float variant of the size query; all four arrays are const here
+cusparseSgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo, // plain int selector; accepted values are not visible in this chunk
+                                            int              m,
+                                            const float*     dl,
+                                            const float*     d,
+                                            const float*     du,
+                                            const float*     x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant of the size query; all four arrays are const here
+cusparseDgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo, // plain int selector; accepted values are not visible in this chunk
+                                            int              m,
+                                            const double*    dl,
+                                            const double*    d,
+                                            const double*    du,
+                                            const double*    x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant of the size query; all four arrays are const here
+cusparseCgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo, // plain int selector; accepted values are not visible in this chunk
+                                            int              m,
+                                            const cuComplex* dl,
+                                            const cuComplex* d,
+                                            const cuComplex* du,
+                                            const cuComplex* x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant of the size query; all four arrays are const here
+cusparseZgtsvInterleavedBatch_bufferSizeExt(cusparseHandle_t       handle,
+                                            int                    algo, // plain int selector; accepted values are not visible in this chunk
+                                            int                    m,
+                                            const cuDoubleComplex* dl,
+                                            const cuDoubleComplex* d,
+                                            const cuDoubleComplex* du,
+                                            const cuDoubleComplex* x,
+                                            int                    batchCount,
+                                            size_t*        pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // float variant; unlike the gtsv2 solvers, dl/d/du are non-const here as well as x
+cusparseSgtsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo, // plain int selector; accepted values are not visible in this chunk
+                              int              m,
+                              float*           dl, // non-const: the callee may modify the diagonals — TODO confirm whether it does
+                              float*           d,
+                              float*           du,
+                              float*           x,
+                              int              batchCount,
+                              void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant; unlike the gtsv2 solvers, dl/d/du are non-const here as well as x
+cusparseDgtsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo, // plain int selector; accepted values are not visible in this chunk
+                              int              m,
+                              double*          dl, // non-const: the callee may modify the diagonals — TODO confirm whether it does
+                              double*          d,
+                              double*          du,
+                              double*          x,
+                              int              batchCount,
+                              void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; unlike the gtsv2 solvers, dl/d/du are non-const here as well as x
+cusparseCgtsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo, // plain int selector; accepted values are not visible in this chunk
+                              int              m,
+                              cuComplex*       dl, // non-const: the callee may modify the diagonals — TODO confirm whether it does
+                              cuComplex*       d,
+                              cuComplex*       du,
+                              cuComplex*       x,
+                              int              batchCount,
+                              void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; unlike the gtsv2 solvers, dl/d/du are non-const here as well as x
+cusparseZgtsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo, // plain int selector; accepted values are not visible in this chunk
+                              int              m,
+                              cuDoubleComplex* dl, // non-const: the callee may modify the diagonals — TODO confirm whether it does
+                              cuDoubleComplex* d,
+                              cuDoubleComplex* du,
+                              cuDoubleComplex* x,
+                              int              batchCount,
+                              void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // float variant of the size query; takes five const band arrays (ds, dl, d, du, dw) plus x
+cusparseSgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo, // plain int selector; accepted values are not visible in this chunk
+                                            int              m,
+                                            const float*     ds, // ds..dw: presumably the five diagonals of a pentadiagonal system — TODO confirm
+                                            const float*     dl,
+                                            const float*     d,
+                                            const float*     du,
+                                            const float*     dw,
+                                            const float*     x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant of the size query; takes five const band arrays (ds, dl, d, du, dw) plus x
+cusparseDgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo, // plain int selector; accepted values are not visible in this chunk
+                                            int              m,
+                                            const double*    ds, // ds..dw: presumably the five diagonals of a pentadiagonal system — TODO confirm
+                                            const double*    dl,
+                                            const double*    d,
+                                            const double*    du,
+                                            const double*    dw,
+                                            const double*    x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant of the size query; takes five const band arrays (ds, dl, d, du, dw) plus x
+cusparseCgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t handle,
+                                            int              algo, // plain int selector; accepted values are not visible in this chunk
+                                            int              m,
+                                            const cuComplex* ds, // ds..dw: presumably the five diagonals of a pentadiagonal system — TODO confirm
+                                            const cuComplex* dl,
+                                            const cuComplex* d,
+                                            const cuComplex* du,
+                                            const cuComplex* dw,
+                                            const cuComplex* x,
+                                            int              batchCount,
+                                            size_t*         pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant of the size query; takes five const band arrays (ds, dl, d, du, dw) plus x
+cusparseZgpsvInterleavedBatch_bufferSizeExt(cusparseHandle_t       handle,
+                                            int                    algo, // plain int selector; accepted values are not visible in this chunk
+                                            int                    m,
+                                            const cuDoubleComplex* ds, // ds..dw: presumably the five diagonals of a pentadiagonal system — TODO confirm
+                                            const cuDoubleComplex* dl,
+                                            const cuDoubleComplex* d,
+                                            const cuDoubleComplex* du,
+                                            const cuDoubleComplex* dw,
+                                            const cuDoubleComplex* x,
+                                            int                    batchCount,
+                                            size_t*         pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // float variant; all six arrays are non-const here, const in the matching _bufferSizeExt
+cusparseSgpsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo, // plain int selector; accepted values are not visible in this chunk
+                              int              m,
+                              float*           ds, // non-const: the callee may modify the band arrays — TODO confirm whether it does
+                              float*           dl,
+                              float*           d,
+                              float*           du,
+                              float*           dw,
+                              float*           x,
+                              int              batchCount,
+                              void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant; all six arrays are non-const here, const in the matching _bufferSizeExt
+cusparseDgpsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo, // plain int selector; accepted values are not visible in this chunk
+                              int              m,
+                              double*          ds, // non-const: the callee may modify the band arrays — TODO confirm whether it does
+                              double*          dl,
+                              double*          d,
+                              double*          du,
+                              double*          dw,
+                              double*          x,
+                              int              batchCount,
+                              void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant; all six arrays are non-const here, const in the matching _bufferSizeExt
+cusparseCgpsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo, // plain int selector; accepted values are not visible in this chunk
+                              int              m,
+                              cuComplex*       ds, // non-const: the callee may modify the band arrays — TODO confirm whether it does
+                              cuComplex*       dl,
+                              cuComplex*       d,
+                              cuComplex*       du,
+                              cuComplex*       dw,
+                              cuComplex*       x,
+                              int              batchCount,
+                              void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant; all six arrays are non-const here, const in the matching _bufferSizeExt
+cusparseZgpsvInterleavedBatch(cusparseHandle_t handle,
+                              int              algo, // plain int selector; accepted values are not visible in this chunk
+                              int              m,
+                              cuDoubleComplex* ds, // non-const: the callee may modify the band arrays — TODO confirm whether it does
+                              cuDoubleComplex* dl,
+                              cuDoubleComplex* d,
+                              cuDoubleComplex* du,
+                              cuDoubleComplex* dw,
+                              cuDoubleComplex* x,
+                              int              batchCount,
+                              void*            pBuffer); // untyped workspace; presumably sized by the matching _bufferSizeExt call — confirm
+
+//##############################################################################
+//# EXTRA ROUTINES
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI // float variant of the csrgeam2 size query; every array argument, including C's, is const
+cusparseScsrgeam2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                const float*             alpha, // scalar passed by pointer; presumably scales A — TODO confirm
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const float*             csrSortedValA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const float*             beta, // scalar passed by pointer; presumably scales B — TODO confirm
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const float*             csrSortedValB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const cusparseMatDescr_t descrC, // note: no nnzC parameter, unlike A and B
+                                const float*             csrSortedValC, // C arrays are const here, non-const in cusparseScsrgeam2
+                                const int*               csrSortedRowPtrC,
+                                const int*               csrSortedColIndC,
+                                size_t*                  pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // double variant of the csrgeam2 size query; every array argument, including C's, is const
+cusparseDcsrgeam2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                const double*            alpha, // scalar passed by pointer; presumably scales A — TODO confirm
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const double*            csrSortedValA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const double*            beta, // scalar passed by pointer; presumably scales B — TODO confirm
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const double*            csrSortedValB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const cusparseMatDescr_t descrC, // note: no nnzC parameter, unlike A and B
+                                const double*            csrSortedValC, // C arrays are const here, non-const in cusparseDcsrgeam2
+                                const int*               csrSortedRowPtrC,
+                                const int*               csrSortedColIndC,
+                                size_t*                  pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuComplex variant of the csrgeam2 size query; every array argument, including C's, is const
+cusparseCcsrgeam2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                const cuComplex*         alpha, // scalar passed by pointer; presumably scales A — TODO confirm
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const cuComplex*         csrSortedValA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const cuComplex*         beta, // scalar passed by pointer; presumably scales B — TODO confirm
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const cuComplex*         csrSortedValB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const cusparseMatDescr_t descrC, // note: no nnzC parameter, unlike A and B
+                                const cuComplex*         csrSortedValC, // const here; presumably writable in the (out-of-view) cusparseCcsrgeam2 — confirm
+                                const int*               csrSortedRowPtrC,
+                                const int*               csrSortedColIndC,
+                                size_t*                  pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // cuDoubleComplex variant of the csrgeam2 size query; every array argument, including C's, is const
+cusparseZcsrgeam2_bufferSizeExt(cusparseHandle_t         handle,
+                                int                      m,
+                                int                      n,
+                                const cuDoubleComplex*   alpha, // scalar passed by pointer; presumably scales A — TODO confirm
+                                const cusparseMatDescr_t descrA,
+                                int                      nnzA,
+                                const cuDoubleComplex*   csrSortedValA,
+                                const int*               csrSortedRowPtrA,
+                                const int*               csrSortedColIndA,
+                                const cuDoubleComplex*   beta, // scalar passed by pointer; presumably scales B — TODO confirm
+                                const cusparseMatDescr_t descrB,
+                                int                      nnzB,
+                                const cuDoubleComplex*   csrSortedValB,
+                                const int*               csrSortedRowPtrB,
+                                const int*               csrSortedColIndB,
+                                const cusparseMatDescr_t descrC, // note: no nnzC parameter, unlike A and B
+                                const cuDoubleComplex*   csrSortedValC, // const here; presumably writable in the (out-of-view) cusparseZcsrgeam2 — confirm
+                                const int*               csrSortedRowPtrC,
+                                const int*               csrSortedColIndC,
+                                size_t*                  pBufferSizeInBytes); // only writable pointer; presumably receives a byte count — confirm
+
+cusparseStatus_t CUSPARSEAPI // type-independent: takes only row-pointer/column-index arrays, no value arrays or alpha/beta
+cusparseXcsrgeam2Nnz(cusparseHandle_t         handle,
+                     int                      m,
+                     int                      n,
+                     const cusparseMatDescr_t descrA,
+                     int                      nnzA,
+                     const int*               csrSortedRowPtrA,
+                     const int*               csrSortedColIndA,
+                     const cusparseMatDescr_t descrB,
+                     int                      nnzB,
+                     const int*               csrSortedRowPtrB,
+                     const int*               csrSortedColIndB,
+                     const cusparseMatDescr_t descrC,
+                     int*                     csrSortedRowPtrC, // non-const; presumably filled in by this call — TODO confirm
+                     int*                     nnzTotalDevHostPtr, // non-const; name suggests a device-or-host pointer for C's nnz — confirm
+                     void*                    workspace); // untyped buffer; sizing rules are not visible in this chunk
+
+cusparseStatus_t CUSPARSEAPI // float variant; A and B arrays are const, all three C arrays are non-const
+cusparseScsrgeam2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  const float*             alpha, // scalar passed by pointer; presumably scales A — TODO confirm
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const float*             csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const float*             beta, // scalar passed by pointer; presumably scales B — TODO confirm
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const float*             csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const cusparseMatDescr_t descrC,
+                  float*                   csrSortedValC, // non-const here, const in cusparseScsrgeam2_bufferSizeExt
+                  int*                     csrSortedRowPtrC, // also non-const in cusparseXcsrgeam2Nnz; call-order contract not shown here — confirm
+                  int*                     csrSortedColIndC,
+                  void*                    pBuffer); // untyped workspace; presumably sized by cusparseScsrgeam2_bufferSizeExt — confirm
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrgeam2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  const double*            alpha,
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const double*            csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const double*            beta,
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const double*            csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const cusparseMatDescr_t descrC,
+                  double*                  csrSortedValC,
+                  int*                     csrSortedRowPtrC,
+                  int*                     csrSortedColIndC,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrgeam2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  const cuComplex*         alpha,
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const cuComplex*         csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const cuComplex*         beta,
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const cuComplex*         csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const cusparseMatDescr_t descrC,
+                  cuComplex*               csrSortedValC,
+                  int*                     csrSortedRowPtrC,
+                  int*                     csrSortedColIndC,
+                  void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrgeam2(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      n,
+                  const cuDoubleComplex*   alpha,
+                  const cusparseMatDescr_t descrA,
+                  int                      nnzA,
+                  const cuDoubleComplex*   csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const cuDoubleComplex*   beta,
+                  const cusparseMatDescr_t descrB,
+                  int                      nnzB,
+                  const cuDoubleComplex*   csrSortedValB,
+                  const int*               csrSortedRowPtrB,
+                  const int*               csrSortedColIndB,
+                  const cusparseMatDescr_t descrC,
+                  cuDoubleComplex*         csrSortedValC,
+                  int*                     csrSortedRowPtrC,
+                  int*                     csrSortedColIndC,
+                  void*                    pBuffer);
+
+//##############################################################################
+//# SPARSE MATRIX REORDERING
+//##############################################################################
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseScsrcolor(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  const float*              csrSortedValA,
+                  const int*                csrSortedRowPtrA,
+                  const int*                csrSortedColIndA,
+                  const float*              fractionToColor,
+                  int*                      ncolors,
+                  int*                      coloring,
+                  int*                      reordering,
+                  const cusparseColorInfo_t info);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsrcolor(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  const double*            csrSortedValA,
+                  const int*               csrSortedRowPtrA,
+                  const int*               csrSortedColIndA,
+                  const double*            fractionToColor,
+                  int*                     ncolors,
+                  int*                     coloring,
+                  int*                     reordering,
+                  const cusparseColorInfo_t info);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsrcolor(cusparseHandle_t         handle,
+                  int                      m,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  const cuComplex*          csrSortedValA,
+                  const int*                csrSortedRowPtrA,
+                  const int*                csrSortedColIndA,
+                  const float*              fractionToColor,
+                  int*                      ncolors,
+                  int*                      coloring,
+                  int*                      reordering,
+                  const cusparseColorInfo_t info);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsrcolor(cusparseHandle_t          handle,
+                  int                       m,
+                  int                       nnz,
+                  const cusparseMatDescr_t  descrA,
+                  const cuDoubleComplex*    csrSortedValA,
+                  const int*                csrSortedRowPtrA,
+                  const int*                csrSortedColIndA,
+                  const double*             fractionToColor,
+                  int*                      ncolors,
+                  int*                      coloring,
+                  int*                      reordering,
+                  const cusparseColorInfo_t info);
+
+//##############################################################################
+//# SPARSE FORMAT CONVERSION
+//##############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSnnz(cusparseHandle_t         handle,
+             cusparseDirection_t      dirA,
+             int                      m,
+             int                      n,
+             const cusparseMatDescr_t descrA,
+             const float*             A,
+             int                      lda,
+             int*                     nnzPerRowCol,
+             int*                     nnzTotalDevHostPtr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnnz(cusparseHandle_t         handle,
+             cusparseDirection_t      dirA,
+             int                      m,
+             int                      n,
+             const cusparseMatDescr_t descrA,
+             const double*            A,
+             int                      lda,
+             int*                     nnzPerRowCol,
+             int*                     nnzTotalDevHostPtr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCnnz(cusparseHandle_t         handle,
+             cusparseDirection_t      dirA,
+             int                      m,
+             int                      n,
+             const cusparseMatDescr_t descrA,
+             const cuComplex*         A,
+             int                      lda,
+             int*                     nnzPerRowCol,
+             int*                     nnzTotalDevHostPtr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZnnz(cusparseHandle_t         handle,
+             cusparseDirection_t      dirA,
+             int                      m,
+             int                      n,
+             const cusparseMatDescr_t descrA,
+             const cuDoubleComplex*   A,
+             int                      lda,
+             int*                     nnzPerRowCol,
+             int*                     nnzTotalDevHostPtr);
+
+//##############################################################################
+//# SPARSE FORMAT CONVERSION
+//##############################################################################
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseSnnz_compress(cusparseHandle_t         handle,
+                      int                      m,
+                      const cusparseMatDescr_t descr,
+                      const float*             csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      int*                     nnzPerRow,
+                      int*                     nnzC,
+                      float                    tol);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDnnz_compress(cusparseHandle_t         handle,
+                      int                      m,
+                      const cusparseMatDescr_t descr,
+                      const double*            csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      int*                     nnzPerRow,
+                      int*                     nnzC,
+                      double                   tol);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCnnz_compress(cusparseHandle_t         handle,
+                      int                      m,
+                      const cusparseMatDescr_t descr,
+                      const cuComplex*         csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      int*                     nnzPerRow,
+                      int*                     nnzC,
+                      cuComplex                tol);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZnnz_compress(cusparseHandle_t         handle,
+                      int                      m,
+                      const cusparseMatDescr_t descr,
+                      const cuDoubleComplex*   csrSortedValA,
+                      const int*               csrSortedRowPtrA,
+                      int*                     nnzPerRow,
+                      int*                     nnzC,
+                      cuDoubleComplex          tol);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2csr_compress(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      n,
+                          const cusparseMatDescr_t descrA,
+                          const float*             csrSortedValA,
+                          const int*               csrSortedColIndA,
+                          const int*               csrSortedRowPtrA,
+                          int                      nnzA,
+                          const int*               nnzPerRow,
+                          float*                   csrSortedValC,
+                          int*                     csrSortedColIndC,
+                          int*                     csrSortedRowPtrC,
+                          float                    tol);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2csr_compress(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      n,
+                          const cusparseMatDescr_t descrA,
+                          const double*            csrSortedValA,
+                          const int*               csrSortedColIndA,
+                          const int*               csrSortedRowPtrA,
+                          int                      nnzA,
+                          const int*               nnzPerRow,
+                          double*                  csrSortedValC,
+                          int*                     csrSortedColIndC,
+                          int*                     csrSortedRowPtrC,
+                          double                   tol);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2csr_compress(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      n,
+                          const cusparseMatDescr_t descrA,
+                          const cuComplex*         csrSortedValA,
+                          const int*               csrSortedColIndA,
+                          const int*               csrSortedRowPtrA,
+                          int                      nnzA,
+                          const int*               nnzPerRow,
+                          cuComplex*               csrSortedValC,
+                          int*                     csrSortedColIndC,
+                          int*                     csrSortedRowPtrC,
+                          cuComplex                tol);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2csr_compress(cusparseHandle_t         handle,
+                          int                      m,
+                          int                      n,
+                          const cusparseMatDescr_t descrA,
+                          const cuDoubleComplex*   csrSortedValA,
+                          const int*               csrSortedColIndA,
+                          const int*               csrSortedRowPtrA,
+                          int                      nnzA,
+                          const int*               nnzPerRow,
+                          cuDoubleComplex*         csrSortedValC,
+                          int*                     csrSortedColIndC,
+                          int*                     csrSortedRowPtrC,
+                          cuDoubleComplex          tol);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcoo2csr(cusparseHandle_t    handle,
+                 const int*          cooRowInd,
+                 int                 nnz,
+                 int                 m,
+                 int*                csrSortedRowPtr,
+                 cusparseIndexBase_t idxBase);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsr2coo(cusparseHandle_t    handle,
+                 const int*          csrSortedRowPtr,
+                 int                 nnz,
+                 int                 m,
+                 int*                cooRowInd,
+                 cusparseIndexBase_t idxBase);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsr2bsrNnz(cusparseHandle_t         handle,
+                    cusparseDirection_t      dirA,
+                    int                      m,
+                    int                      n,
+                    const cusparseMatDescr_t descrA,
+                    const int*               csrSortedRowPtrA,
+                    const int*               csrSortedColIndA,
+                    int                      blockDim,
+                    const cusparseMatDescr_t descrC,
+                    int*                     bsrSortedRowPtrC,
+                    int*                     nnzTotalDevHostPtr);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2bsr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      m,
+                 int                      n,
+                 const cusparseMatDescr_t descrA,
+                 const float*             csrSortedValA,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 float*                   bsrSortedValC,
+                 int*                     bsrSortedRowPtrC,
+                 int*                     bsrSortedColIndC);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2bsr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      m,
+                 int                      n,
+                 const cusparseMatDescr_t descrA,
+                 const double*            csrSortedValA,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 double*                  bsrSortedValC,
+                 int*                     bsrSortedRowPtrC,
+                 int*                     bsrSortedColIndC);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2bsr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      m,
+                 int                      n,
+                 const cusparseMatDescr_t descrA,
+                 const cuComplex*         csrSortedValA,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 cuComplex*               bsrSortedValC,
+                 int*                     bsrSortedRowPtrC,
+                 int*                     bsrSortedColIndC);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2bsr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      m,
+                 int                      n,
+                 const cusparseMatDescr_t descrA,
+                 const cuDoubleComplex*   csrSortedValA,
+                 const int*               csrSortedRowPtrA,
+                 const int*               csrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 cuDoubleComplex*         bsrSortedValC,
+                 int*                     bsrSortedRowPtrC,
+                 int*                     bsrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSbsr2csr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nb,
+                 const cusparseMatDescr_t descrA,
+                 const float*             bsrSortedValA,
+                 const int*               bsrSortedRowPtrA,
+                 const int*               bsrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 float*                   csrSortedValC,
+                 int*                     csrSortedRowPtrC,
+                 int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDbsr2csr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nb,
+                 const cusparseMatDescr_t descrA,
+                 const double*            bsrSortedValA,
+                 const int*               bsrSortedRowPtrA,
+                 const int*               bsrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 double*                  csrSortedValC,
+                 int*                     csrSortedRowPtrC,
+                 int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCbsr2csr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nb,
+                 const cusparseMatDescr_t descrA,
+                 const cuComplex*         bsrSortedValA,
+                 const int*               bsrSortedRowPtrA,
+                 const int*               bsrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 cuComplex*               csrSortedValC,
+                 int*                     csrSortedRowPtrC,
+                 int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZbsr2csr(cusparseHandle_t         handle,
+                 cusparseDirection_t      dirA,
+                 int                      mb,
+                 int                      nb,
+                 const cusparseMatDescr_t descrA,
+                 const cuDoubleComplex*   bsrSortedValA,
+                 const int*               bsrSortedRowPtrA,
+                 const int*               bsrSortedColIndA,
+                 int                      blockDim,
+                 const cusparseMatDescr_t descrC,
+                 cuDoubleComplex*         csrSortedValC,
+                 int*                     csrSortedRowPtrC,
+                 int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsc_bufferSize(cusparseHandle_t handle,
+                                int              mb,
+                                int              nb,
+                                int              nnzb,
+                                const float*     bsrSortedVal,
+                                const int*       bsrSortedRowPtr,
+                                const int*       bsrSortedColInd,
+                                int              rowBlockDim,
+                                int              colBlockDim,
+                                int*             pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsc_bufferSize(cusparseHandle_t handle,
+                                int              mb,
+                                int              nb,
+                                int              nnzb,
+                                const double*    bsrSortedVal,
+                                const int*       bsrSortedRowPtr,
+                                const int*       bsrSortedColInd,
+                                int              rowBlockDim,
+                                int              colBlockDim,
+                                int*             pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsc_bufferSize(cusparseHandle_t handle,
+                                int              mb,
+                                int              nb,
+                                int              nnzb,
+                                const cuComplex* bsrSortedVal,
+                                const int*       bsrSortedRowPtr,
+                                const int*       bsrSortedColInd,
+                                int              rowBlockDim,
+                                int              colBlockDim,
+                                int*             pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsc_bufferSize(cusparseHandle_t       handle,
+                                int                    mb,
+                                int                    nb,
+                                int                    nnzb,
+                                const cuDoubleComplex* bsrSortedVal,
+                                const int*             bsrSortedRowPtr,
+                                const int*             bsrSortedColInd,
+                                int                    rowBlockDim,
+                                int                    colBlockDim,
+                                int*                   pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle,
+                                   int              mb,
+                                   int              nb,
+                                   int              nnzb,
+                                   const float*     bsrSortedVal,
+                                   const int*       bsrSortedRowPtr,
+                                   const int*       bsrSortedColInd,
+                                   int              rowBlockDim,
+                                   int              colBlockDim,
+                                   size_t*          pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle,
+                                   int              mb,
+                                   int              nb,
+                                   int              nnzb,
+                                   const double*    bsrSortedVal,
+                                   const int*       bsrSortedRowPtr,
+                                   const int*       bsrSortedColInd,
+                                   int              rowBlockDim,
+                                   int              colBlockDim,
+                                   size_t*          pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsc_bufferSizeExt(cusparseHandle_t handle,
+                                   int              mb,
+                                   int              nb,
+                                   int              nnzb,
+                                   const cuComplex* bsrSortedVal,
+                                   const int*       bsrSortedRowPtr,
+                                   const int*       bsrSortedColInd,
+                                   int              rowBlockDim,
+                                   int              colBlockDim,
+                                   size_t*          pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsc_bufferSizeExt(cusparseHandle_t       handle,
+                                   int                    mb,
+                                   int                    nb,
+                                   int                    nnzb,
+                                   const cuDoubleComplex* bsrSortedVal,
+                                   const int*             bsrSortedRowPtr,
+                                   const int*             bsrSortedColInd,
+                                   int                    rowBlockDim,
+                                   int                    colBlockDim,
+                                   size_t*                pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsc(cusparseHandle_t handle,
+                     int              mb,
+                     int              nb,
+                     int              nnzb,
+                     const float*     bsrSortedVal,
+                     const int* bsrSortedRowPtr,
+                     const int* bsrSortedColInd,
+                     int        rowBlockDim,
+                     int        colBlockDim,
+                     float*     bscVal,
+                     int*       bscRowInd,
+                     int*       bscColPtr,
+                     cusparseAction_t copyValues,
+                     cusparseIndexBase_t idxBase,
+                     void*               pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsc(cusparseHandle_t    handle,
+                     int                 mb,
+                     int                 nb,
+                     int                 nnzb,
+                     const double*       bsrSortedVal,
+                     const int*          bsrSortedRowPtr,
+                     const int*          bsrSortedColInd,
+                     int                 rowBlockDim,
+                     int                 colBlockDim,
+                     double*             bscVal,
+                     int*                bscRowInd,
+                     int*                bscColPtr,
+                     cusparseAction_t    copyValues,
+                     cusparseIndexBase_t idxBase,
+                     void*               pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsc(cusparseHandle_t    handle,
+                     int                 mb,
+                     int                 nb,
+                     int                 nnzb,
+                     const cuComplex*    bsrSortedVal,
+                     const int*          bsrSortedRowPtr,
+                     const int*          bsrSortedColInd,
+                     int                 rowBlockDim,
+                     int                 colBlockDim,
+                     cuComplex*          bscVal,
+                     int*                bscRowInd,
+                     int*                bscColPtr,
+                     cusparseAction_t    copyValues,
+                     cusparseIndexBase_t idxBase,
+                     void*               pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsc(cusparseHandle_t       handle,
+                     int                    mb,
+                     int                    nb,
+                     int                    nnzb,
+                     const cuDoubleComplex* bsrSortedVal,
+                     const int*             bsrSortedRowPtr,
+                     const int*             bsrSortedColInd,
+                     int                    rowBlockDim,
+                     int                    colBlockDim,
+                     cuDoubleComplex*       bscVal,
+                     int*                   bscRowInd,
+                     int*                   bscColPtr,
+                     cusparseAction_t       copyValues,
+                     cusparseIndexBase_t    idxBase,
+                     void*                  pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseXgebsr2csr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      mb,
+                   int                      nb,
+                   const cusparseMatDescr_t descrA,
+                   const int*               bsrSortedRowPtrA,
+                   const int*               bsrSortedColIndA,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   const cusparseMatDescr_t descrC,
+                   int*                     csrSortedRowPtrC,
+                   int*                     csrSortedColIndC);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2csr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      mb,
+                   int                      nb,
+                   const cusparseMatDescr_t descrA,
+                   const float*             bsrSortedValA,
+                   const int*               bsrSortedRowPtrA,
+                   const int*               bsrSortedColIndA,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   const cusparseMatDescr_t descrC,
+                   float*                   csrSortedValC,
+                   int*                     csrSortedRowPtrC,
+                   int*                     csrSortedColIndC);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2csr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      mb,
+                   int                      nb,
+                   const cusparseMatDescr_t descrA,
+                   const double*            bsrSortedValA,
+                   const int*               bsrSortedRowPtrA,
+                   const int*               bsrSortedColIndA,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   const cusparseMatDescr_t descrC,
+                   double*                  csrSortedValC,
+                   int*                     csrSortedRowPtrC,
+                   int*                     csrSortedColIndC);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2csr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      mb,
+                   int                      nb,
+                   const cusparseMatDescr_t descrA,
+                   const cuComplex*         bsrSortedValA,
+                   const int*               bsrSortedRowPtrA,
+                   const int*               bsrSortedColIndA,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   const cusparseMatDescr_t descrC,
+                   cuComplex*               csrSortedValC,
+                   int*                     csrSortedRowPtrC,
+                   int*                     csrSortedColIndC);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2csr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      mb,
+                   int                      nb,
+                   const cusparseMatDescr_t descrA,
+                   const cuDoubleComplex*   bsrSortedValA,
+                   const int*               bsrSortedRowPtrA,
+                   const int*               bsrSortedColIndA,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   const cusparseMatDescr_t descrC,
+                   cuDoubleComplex*         csrSortedValC,
+                   int*                     csrSortedRowPtrC,
+                   int*                     csrSortedColIndC);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              int                      m,
+                              int                      n,
+                              const cusparseMatDescr_t descrA,
+                              const float*             csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              int                      rowBlockDim,
+                              int                      colBlockDim,
+                              int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              int                      m,
+                              int                      n,
+                              const cusparseMatDescr_t descrA,
+                              const double*            csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              int                      rowBlockDim,
+                              int                      colBlockDim,
+                              int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              int                      m,
+                              int                      n,
+                              const cusparseMatDescr_t descrA,
+                              const cuComplex*         csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              int                      rowBlockDim,
+                              int                      colBlockDim,
+                              int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                              cusparseDirection_t      dirA,
+                              int                      m,
+                              int                      n,
+                              const cusparseMatDescr_t descrA,
+                              const cuDoubleComplex*   csrSortedValA,
+                              const int*               csrSortedRowPtrA,
+                              const int*               csrSortedColIndA,
+                              int                      rowBlockDim,
+                              int                      colBlockDim,
+                              int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                 cusparseDirection_t      dirA,
+                                 int                      m,
+                                 int                      n,
+                                 const cusparseMatDescr_t descrA,
+                                 const float*             csrSortedValA,
+                                 const int*               csrSortedRowPtrA,
+                                 const int*               csrSortedColIndA,
+                                 int                      rowBlockDim,
+                                 int                      colBlockDim,
+                                 size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                 cusparseDirection_t      dirA,
+                                 int                      m,
+                                 int                      n,
+                                 const cusparseMatDescr_t descrA,
+                                 const double*            csrSortedValA,
+                                 const int*               csrSortedRowPtrA,
+                                 const int*               csrSortedColIndA,
+                                 int                      rowBlockDim,
+                                 int                      colBlockDim,
+                                 size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                 cusparseDirection_t      dirA,
+                                 int                      m,
+                                 int                      n,
+                                 const cusparseMatDescr_t descrA,
+                                 const cuComplex*         csrSortedValA,
+                                 const int*               csrSortedRowPtrA,
+                                 const int*               csrSortedColIndA,
+                                 int                      rowBlockDim,
+                                 int                      colBlockDim,
+                                 size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                 cusparseDirection_t      dirA,
+                                 int                      m,
+                                 int                      n,
+                                 const cusparseMatDescr_t descrA,
+                                 const cuDoubleComplex*   csrSortedValA,
+                                 const int*               csrSortedRowPtrA,
+                                 const int*               csrSortedColIndA,
+                                 int                      rowBlockDim,
+                                 int                      colBlockDim,
+                                 size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsr2gebsrNnz(cusparseHandle_t         handle,
+                      cusparseDirection_t      dirA,
+                      int                      m,
+                      int                      n,
+                      const cusparseMatDescr_t descrA,
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      const cusparseMatDescr_t descrC,
+                      int*                     bsrSortedRowPtrC,
+                      int                      rowBlockDim,
+                      int                      colBlockDim,
+                      int*                     nnzTotalDevHostPtr,
+                      void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2gebsr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const float*             csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   const cusparseMatDescr_t descrC,
+                   float*                   bsrSortedValC,
+                   int*                     bsrSortedRowPtrC,
+                   int*                     bsrSortedColIndC,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2gebsr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const double*            csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   const cusparseMatDescr_t descrC,
+                   double*                  bsrSortedValC,
+                   int*                     bsrSortedRowPtrC,
+                   int*                     bsrSortedColIndC,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2gebsr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const cuComplex*         csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   const cusparseMatDescr_t descrC,
+                   cuComplex*               bsrSortedValC,
+                   int*                     bsrSortedRowPtrC,
+                   int*                     bsrSortedColIndC,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2gebsr(cusparseHandle_t         handle,
+                   cusparseDirection_t      dirA,
+                   int                      m,
+                   int                      n,
+                   const cusparseMatDescr_t descrA,
+                   const cuDoubleComplex*   csrSortedValA,
+                   const int*               csrSortedRowPtrA,
+                   const int*               csrSortedColIndA,
+                   const cusparseMatDescr_t descrC,
+                   cuDoubleComplex*         bsrSortedValC,
+                   int*                     bsrSortedRowPtrC,
+                   int*                     bsrSortedColIndC,
+                   int                      rowBlockDim,
+                   int                      colBlockDim,
+                   void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                const float*             bsrSortedValA,
+                                const int*               bsrSortedRowPtrA,
+                                const int*               bsrSortedColIndA,
+                                int                      rowBlockDimA,
+                                int                      colBlockDimA,
+                                int                      rowBlockDimC,
+                                int                      colBlockDimC,
+                                int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                const double*            bsrSortedValA,
+                                const int*               bsrSortedRowPtrA,
+                                const int*               bsrSortedColIndA,
+                                int                      rowBlockDimA,
+                                int                      colBlockDimA,
+                                int                      rowBlockDimC,
+                                int                      colBlockDimC,
+                                int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                const cuComplex*         bsrSortedValA,
+                                const int*               bsrSortedRowPtrA,
+                                const int*               bsrSortedColIndA,
+                                int                      rowBlockDimA,
+                                int                      colBlockDimA,
+                                int                      rowBlockDimC,
+                                int                      colBlockDimC,
+                                int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsr_bufferSize(cusparseHandle_t         handle,
+                                cusparseDirection_t      dirA,
+                                int                      mb,
+                                int                      nb,
+                                int                      nnzb,
+                                const cusparseMatDescr_t descrA,
+                                const cuDoubleComplex*   bsrSortedValA,
+                                const int*               bsrSortedRowPtrA,
+                                const int*               bsrSortedColIndA,
+                                int                      rowBlockDimA,
+                                int                      colBlockDimA,
+                                int                      rowBlockDimC,
+                                int                      colBlockDimC,
+                                int*                     pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                   cusparseDirection_t      dirA,
+                                   int                      mb,
+                                   int                      nb,
+                                   int                      nnzb,
+                                   const cusparseMatDescr_t descrA,
+                                   const float*             bsrSortedValA,
+                                   const int*               bsrSortedRowPtrA,
+                                   const int*               bsrSortedColIndA,
+                                   int                      rowBlockDimA,
+                                   int                      colBlockDimA,
+                                   int                      rowBlockDimC,
+                                   int                      colBlockDimC,
+                                   size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                   cusparseDirection_t      dirA,
+                                   int                      mb,
+                                   int                      nb,
+                                   int                      nnzb,
+                                   const cusparseMatDescr_t descrA,
+                                   const double*            bsrSortedValA,
+                                   const int*               bsrSortedRowPtrA,
+                                   const int*               bsrSortedColIndA,
+                                   int                      rowBlockDimA,
+                                   int                      colBlockDimA,
+                                   int                      rowBlockDimC,
+                                   int                      colBlockDimC,
+                                   size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                   cusparseDirection_t      dirA,
+                                   int                      mb,
+                                   int                      nb,
+                                   int                      nnzb,
+                                   const cusparseMatDescr_t descrA,
+                                   const cuComplex*         bsrSortedValA,
+                                   const int*               bsrSortedRowPtrA,
+                                   const int*               bsrSortedColIndA,
+                                   int                      rowBlockDimA,
+                                   int                      colBlockDimA,
+                                   int                      rowBlockDimC,
+                                   int                      colBlockDimC,
+                                   size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsr_bufferSizeExt(cusparseHandle_t         handle,
+                                   cusparseDirection_t      dirA,
+                                   int                      mb,
+                                   int                      nb,
+                                   int                      nnzb,
+                                   const cusparseMatDescr_t descrA,
+                                   const cuDoubleComplex*   bsrSortedValA,
+                                   const int*               bsrSortedRowPtrA,
+                                   const int*               bsrSortedColIndA,
+                                   int                      rowBlockDimA,
+                                   int                      colBlockDimA,
+                                   int                      rowBlockDimC,
+                                   int                      colBlockDimC,
+                                   size_t*                  pBufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXgebsr2gebsrNnz(cusparseHandle_t         handle,
+                        cusparseDirection_t      dirA,
+                        int                      mb,
+                        int                      nb,
+                        int                      nnzb,
+                        const cusparseMatDescr_t descrA,
+                        const int*               bsrSortedRowPtrA,
+                        const int*               bsrSortedColIndA,
+                        int                      rowBlockDimA,
+                        int                      colBlockDimA,
+                        const cusparseMatDescr_t descrC,
+                        int*                     bsrSortedRowPtrC,
+                        int                      rowBlockDimC,
+                        int                      colBlockDimC,
+                        int*                     nnzTotalDevHostPtr,
+                        void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSgebsr2gebsr(cusparseHandle_t         handle,
+                     cusparseDirection_t      dirA,
+                     int                      mb,
+                     int                      nb,
+                     int                      nnzb,
+                     const cusparseMatDescr_t descrA,
+                     const float*             bsrSortedValA,
+                     const int*               bsrSortedRowPtrA,
+                     const int*               bsrSortedColIndA,
+                     int                      rowBlockDimA,
+                     int                      colBlockDimA,
+                     const cusparseMatDescr_t descrC,
+                     float*                   bsrSortedValC,
+                     int*                     bsrSortedRowPtrC,
+                     int*                     bsrSortedColIndC,
+                     int                      rowBlockDimC,
+                     int                      colBlockDimC,
+                     void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDgebsr2gebsr(cusparseHandle_t         handle,
+                     cusparseDirection_t      dirA,
+                     int                      mb,
+                     int                      nb,
+                     int                      nnzb,
+                     const cusparseMatDescr_t descrA,
+                     const double*            bsrSortedValA,
+                     const int*               bsrSortedRowPtrA,
+                     const int*               bsrSortedColIndA,
+                     int                      rowBlockDimA,
+                     int                      colBlockDimA,
+                     const cusparseMatDescr_t descrC,
+                     double*                  bsrSortedValC,
+                     int*                     bsrSortedRowPtrC,
+                     int*                     bsrSortedColIndC,
+                     int                      rowBlockDimC,
+                     int                      colBlockDimC,
+                     void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCgebsr2gebsr(cusparseHandle_t         handle,
+                     cusparseDirection_t      dirA,
+                     int                      mb,
+                     int                      nb,
+                     int                      nnzb,
+                     const cusparseMatDescr_t descrA,
+                     const cuComplex*         bsrSortedValA,
+                     const int*               bsrSortedRowPtrA,
+                     const int*               bsrSortedColIndA,
+                     int                      rowBlockDimA,
+                     int                      colBlockDimA,
+                     const cusparseMatDescr_t descrC,
+                     cuComplex*               bsrSortedValC,
+                     int*                     bsrSortedRowPtrC,
+                     int*                     bsrSortedColIndC,
+                     int                      rowBlockDimC,
+                     int                      colBlockDimC,
+                     void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseZgebsr2gebsr(cusparseHandle_t         handle,
+                     cusparseDirection_t      dirA,
+                     int                      mb,
+                     int                      nb,
+                     int                      nnzb,
+                     const cusparseMatDescr_t descrA,
+                     const cuDoubleComplex*   bsrSortedValA,
+                     const int*               bsrSortedRowPtrA,
+                     const int*               bsrSortedColIndA,
+                     int                      rowBlockDimA,
+                     int                      colBlockDimA,
+                     const cusparseMatDescr_t descrC,
+                     cuDoubleComplex*         bsrSortedValC,
+                     int*                     bsrSortedRowPtrC,
+                     int*                     bsrSortedColIndC,
+                     int                      rowBlockDimC,
+                     int                      colBlockDimC,
+                     void*                    pBuffer);
+
+//##############################################################################
+//# SPARSE MATRIX SORTING
+//##############################################################################
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateIdentityPermutation(cusparseHandle_t handle,
+                                  int              n,
+                                  int*             p);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcoosort_bufferSizeExt(cusparseHandle_t handle,
+                               int              m,
+                               int              n,
+                               int              nnz,
+                               const int*       cooRowsA,
+                               const int*       cooColsA,
+                               size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcoosortByRow(cusparseHandle_t handle,
+                      int              m,
+                      int              n,
+                      int              nnz,
+                      int*             cooRowsA,
+                      int*             cooColsA,
+                      int*             P,
+                      void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcoosortByColumn(cusparseHandle_t handle,
+                         int              m,
+                         int              n,
+                         int              nnz,
+                         int*             cooRowsA,
+                         int*             cooColsA,
+                         int*             P,
+                         void*            pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsrsort_bufferSizeExt(cusparseHandle_t handle,
+                               int              m,
+                               int              n,
+                               int              nnz,
+                               const int*       csrRowPtrA,
+                               const int*       csrColIndA,
+                               size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcsrsort(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      n,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 const int*               csrRowPtrA,
+                 int*                     csrColIndA,
+                 int*                     P,
+                 void*                    pBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcscsort_bufferSizeExt(cusparseHandle_t handle,
+                               int              m,
+                               int              n,
+                               int              nnz,
+                               const int*       cscColPtrA,
+                               const int*       cscRowIndA,
+                               size_t*          pBufferSizeInBytes);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseXcscsort(cusparseHandle_t         handle,
+                 int                      m,
+                 int                      n,
+                 int                      nnz,
+                 const cusparseMatDescr_t descrA,
+                 const int*               cscColPtrA,
+                 int*                     cscRowIndA,
+                 int*                     P,
+                 void*                    pBuffer);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseScsru2csr_bufferSizeExt(cusparseHandle_t handle,
+                                int              m,
+                                int              n,
+                                int              nnz,
+                                float*           csrVal,
+                                const int*       csrRowPtr,
+                                int*             csrColInd,
+                                csru2csrInfo_t   info,
+                                size_t*          pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsru2csr_bufferSizeExt(cusparseHandle_t handle,
+                                int              m,
+                                int              n,
+                                int              nnz,
+                                double*          csrVal,
+                                const int*       csrRowPtr,
+                                int*             csrColInd,
+                                csru2csrInfo_t   info,
+                                size_t*          pBufferSizeInBytes);
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsru2csr_bufferSizeExt(cusparseHandle_t handle,
+                                int              m,
+                                int              n,
+                                int              nnz,
+                                cuComplex*       csrVal, // cuComplex variant; non-const
+                                const int*       csrRowPtr, // the only const array argument
+                                int*             csrColInd, // non-const
+                                csru2csrInfo_t   info,
+                                size_t*          pBufferSizeInBytes); // NOTE(review): presumably receives the byte size of the pBuffer used by cusparseCcsru2csr - confirm
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsru2csr_bufferSizeExt(cusparseHandle_t handle,
+                                int              m,
+                                int              n,
+                                int              nnz,
+                                cuDoubleComplex* csrVal, // cuDoubleComplex variant; non-const
+                                const int*       csrRowPtr, // the only const array argument
+                                int*             csrColInd, // non-const
+                                csru2csrInfo_t   info,
+                                size_t*          pBufferSizeInBytes); // NOTE(review): presumably receives the byte size of the pBuffer used by cusparseZcsru2csr - confirm
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseScsru2csr(cusparseHandle_t         handle, // parameter list is identical to cusparseScsr2csru
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA, // not taken by cusparseScsru2csr_bufferSizeExt
+                  float*                   csrVal, // float variant; non-const
+                  const int*               csrRowPtr, // read-only
+                  int*                     csrColInd, // non-const
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer); // NOTE(review): presumably workspace sized by cusparseScsru2csr_bufferSizeExt - confirm
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsru2csr(cusparseHandle_t         handle, // parameter list is identical to cusparseDcsr2csru
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA, // not taken by cusparseDcsru2csr_bufferSizeExt
+                  double*                  csrVal, // double variant; non-const
+                  const int*               csrRowPtr, // read-only
+                  int*                     csrColInd, // non-const
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer); // NOTE(review): presumably workspace sized by cusparseDcsru2csr_bufferSizeExt - confirm
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsru2csr(cusparseHandle_t         handle, // parameter list is identical to cusparseCcsr2csru
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA, // not taken by cusparseCcsru2csr_bufferSizeExt
+                  cuComplex*               csrVal, // cuComplex variant; non-const
+                  const int*               csrRowPtr, // read-only
+                  int*                     csrColInd, // non-const
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer); // NOTE(review): presumably workspace sized by cusparseCcsru2csr_bufferSizeExt - confirm
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsru2csr(cusparseHandle_t         handle, // parameter list is identical to cusparseZcsr2csru
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA, // not taken by cusparseZcsru2csr_bufferSizeExt
+                  cuDoubleComplex*         csrVal, // cuDoubleComplex variant; non-const
+                  const int*               csrRowPtr, // read-only
+                  int*                     csrColInd, // non-const
+                  csru2csrInfo_t           info,
+                  void*                    pBuffer); // NOTE(review): presumably workspace sized by cusparseZcsru2csr_bufferSizeExt - confirm
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseScsr2csru(cusparseHandle_t         handle, // parameter list is identical to cusparseScsru2csr
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  float*                   csrVal, // float variant; non-const
+                  const int*               csrRowPtr, // read-only
+                  int*                     csrColInd, // non-const
+                  csru2csrInfo_t           info, // NOTE(review): presumably the same info object used with cusparseScsru2csr - confirm
+                  void*                    pBuffer);
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDcsr2csru(cusparseHandle_t         handle, // parameter list is identical to cusparseDcsru2csr
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  double*                  csrVal, // double variant; non-const
+                  const int*               csrRowPtr, // read-only
+                  int*                     csrColInd, // non-const
+                  csru2csrInfo_t           info, // NOTE(review): presumably the same info object used with cusparseDcsru2csr - confirm
+                  void*                    pBuffer);
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseCcsr2csru(cusparseHandle_t         handle, // parameter list is identical to cusparseCcsru2csr
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  cuComplex*               csrVal, // cuComplex variant; non-const
+                  const int*               csrRowPtr, // read-only
+                  int*                     csrColInd, // non-const
+                  csru2csrInfo_t           info, // NOTE(review): presumably the same info object used with cusparseCcsru2csr - confirm
+                  void*                    pBuffer);
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseZcsr2csru(cusparseHandle_t         handle, // parameter list is identical to cusparseZcsru2csr
+                  int                      m,
+                  int                      n,
+                  int                      nnz,
+                  const cusparseMatDescr_t descrA,
+                  cuDoubleComplex*         csrVal, // cuDoubleComplex variant; non-const
+                  const int*               csrRowPtr, // read-only
+                  int*                     csrColInd, // non-const
+                  csru2csrInfo_t           info, // NOTE(review): presumably the same info object used with cusparseZcsru2csr - confirm
+                  void*                    pBuffer);
+
+#if defined(__cplusplus)
+// __half variant: this declaration is visible only when __cplusplus is defined.
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                      int                      m,
+                                      int                      n,
+                                      const __half*            A,
+                                      int                      lda,
+                                      const __half*            threshold, // passed by pointer, same element type as A
+                                      const cusparseMatDescr_t descrC,
+                                      const __half*            csrSortedValC, // all three C arrays are const in this declaration
+                                      const int*               csrSortedRowPtrC,
+                                      const int*               csrSortedColIndC,
+                                      size_t* pBufferSizeInBytes); // only non-const pointer; NOTE(review): presumably receives the workspace size in bytes - confirm
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                      int                      m,
+                                      int                      n,
+                                      const float*             A, // float variant
+                                      int                      lda,
+                                      const float*             threshold, // passed by pointer, same element type as A
+                                      const cusparseMatDescr_t descrC,
+                                      const float*             csrSortedValC, // all three C arrays are const in this declaration
+                                      const int*               csrSortedRowPtrC,
+                                      const int*               csrSortedColIndC,
+                                      size_t* pBufferSizeInBytes); // only non-const pointer; NOTE(review): presumably receives the workspace size in bytes - confirm
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                      int                      m,
+                                      int                      n,
+                                      const double*            A, // double variant
+                                      int                      lda,
+                                      const double*            threshold, // passed by pointer, same element type as A
+                                      const cusparseMatDescr_t descrC,
+                                      const double*            csrSortedValC, // all three C arrays are const in this declaration
+                                      const int*               csrSortedRowPtrC,
+                                      const int*               csrSortedColIndC,
+                                      size_t*               pBufferSizeInBytes); // only non-const pointer; NOTE(review): presumably receives the workspace size in bytes - confirm
+
+#if defined(__cplusplus)
+// __half variant: this declaration is visible only when __cplusplus is defined.
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csrNnz(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      n,
+                           const __half*            A,
+                           int                      lda,
+                           const __half*            threshold, // passed by pointer, same element type as A
+                           const cusparseMatDescr_t descrC,
+                           int*                     csrRowPtrC, // non-const row-pointer array
+                           int*                     nnzTotalDevHostPtr, // NOTE(review): name suggests a device or host pointer is accepted - confirm
+                           void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csrNnz(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      n,
+                           const float*             A, // float variant
+                           int                      lda,
+                           const float*             threshold, // passed by pointer, same element type as A
+                           const cusparseMatDescr_t descrC,
+                           int*                     csrRowPtrC, // non-const row-pointer array
+                           int*                     nnzTotalDevHostPtr, // NOTE(review): name suggests a device or host pointer is accepted - confirm
+                           void*                    pBuffer);
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csrNnz(cusparseHandle_t         handle,
+                           int                      m,
+                           int                      n,
+                           const double*            A, // double variant
+                           int                      lda,
+                           const double*            threshold, // passed by pointer, same element type as A
+                           const cusparseMatDescr_t descrC,
+                           int*                     csrSortedRowPtrC, // named csrRowPtrC in the H and S variants; same type and position
+                           int*                     nnzTotalDevHostPtr, // NOTE(review): name suggests a device or host pointer is accepted - confirm
+                           void*                    pBuffer);
+
+#if defined(__cplusplus)
+// __half variant: this declaration is visible only when __cplusplus is defined.
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csr(cusparseHandle_t         handle,
+                        int                      m,
+                        int                      n,
+                        const __half*            A,
+                        int                      lda,
+                        const __half*            threshold, // passed by pointer, same element type as A
+                        const cusparseMatDescr_t descrC,
+                        __half*                  csrSortedValC, // non-const here; const in cusparseHpruneDense2csr_bufferSizeExt
+                        const int*               csrSortedRowPtrC, // const here; cusparseHpruneDense2csrNnz takes its row pointers as int*
+                        int*                     csrSortedColIndC, // non-const
+                        void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csr(cusparseHandle_t         handle,
+                        int                      m,
+                        int                      n,
+                        const float*             A, // float variant
+                        int                      lda,
+                        const float*             threshold, // passed by pointer, same element type as A
+                        const cusparseMatDescr_t descrC,
+                        float*                   csrSortedValC, // non-const here; const in cusparseSpruneDense2csr_bufferSizeExt
+                        const int*               csrSortedRowPtrC, // const here; cusparseSpruneDense2csrNnz takes its row pointers as int*
+                        int*                     csrSortedColIndC, // non-const
+                        void*                    pBuffer);
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csr(cusparseHandle_t         handle,
+                        int                      m,
+                        int                      n,
+                        const double*            A, // double variant
+                        int                      lda,
+                        const double*            threshold, // passed by pointer, same element type as A
+                        const cusparseMatDescr_t descrC,
+                        double*                  csrSortedValC, // non-const here; const in cusparseDpruneDense2csr_bufferSizeExt
+                        const int*               csrSortedRowPtrC, // const here; cusparseDpruneDense2csrNnz takes its row pointers as int*
+                        int*                     csrSortedColIndC, // non-const
+                        void*                    pBuffer);
+
+#if defined(__cplusplus)
+// __half variant: this declaration is visible only when __cplusplus is defined.
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneCsr2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const __half*            csrSortedValA, // the A arrays are all const
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    const __half*            threshold, // passed by pointer, same element type as csrSortedValA
+                                    const cusparseMatDescr_t descrC,
+                                    const __half*            csrSortedValC, // the C arrays are also const in this declaration
+                                    const int*               csrSortedRowPtrC,
+                                    const int*               csrSortedColIndC,
+                                    size_t* pBufferSizeInBytes); // only non-const pointer; NOTE(review): presumably receives the workspace size in bytes - confirm
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneCsr2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const float*             csrSortedValA, // float variant; the A arrays are all const
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    const float*             threshold, // passed by pointer, same element type as csrSortedValA
+                                    const cusparseMatDescr_t descrC,
+                                    const float*             csrSortedValC, // the C arrays are also const in this declaration
+                                    const int*               csrSortedRowPtrC,
+                                    const int*               csrSortedColIndC,
+                                    size_t*                 pBufferSizeInBytes); // only non-const pointer; NOTE(review): presumably receives the workspace size in bytes - confirm
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneCsr2csr_bufferSizeExt(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const double*            csrSortedValA, // double variant; the A arrays are all const
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    const double*            threshold, // passed by pointer, same element type as csrSortedValA
+                                    const cusparseMatDescr_t descrC,
+                                    const double*            csrSortedValC, // the C arrays are also const in this declaration
+                                    const int*               csrSortedRowPtrC,
+                                    const int*               csrSortedColIndC,
+                                    size_t*                 pBufferSizeInBytes); // only non-const pointer; NOTE(review): presumably receives the workspace size in bytes - confirm
+
+#if defined(__cplusplus)
+// __half variant: this declaration is visible only when __cplusplus is defined.
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneCsr2csrNnz(cusparseHandle_t         handle,
+                         int                      m,
+                         int                      n,
+                         int                      nnzA,
+                         const cusparseMatDescr_t descrA,
+                         const __half*            csrSortedValA, // the A arrays are all const
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         const __half*            threshold, // passed by pointer, same element type as csrSortedValA
+                         const cusparseMatDescr_t descrC,
+                         int*                     csrSortedRowPtrC, // non-const row-pointer array for C
+                         int*                     nnzTotalDevHostPtr, // NOTE(review): name suggests a device or host pointer is accepted - confirm
+                         void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneCsr2csrNnz(cusparseHandle_t         handle,
+                         int                      m,
+                         int                      n,
+                         int                      nnzA,
+                         const cusparseMatDescr_t descrA,
+                         const float*             csrSortedValA, // float variant; the A arrays are all const
+                         const int*               csrSortedRowPtrA,
+                         const int*               csrSortedColIndA,
+                         const float*             threshold, // passed by pointer, same element type as csrSortedValA
+                         const cusparseMatDescr_t descrC,
+                         int*                     csrSortedRowPtrC, // non-const row-pointer array for C
+                         int*                     nnzTotalDevHostPtr, // NOTE(review): name suggests a device or host pointer is accepted - confirm
+                         void*                    pBuffer);
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneCsr2csrNnz(cusparseHandle_t         handle, // continuation lines sit one column deeper than in the H/S variants (cosmetic only)
+                          int                      m,
+                          int                      n,
+                          int                      nnzA,
+                          const cusparseMatDescr_t descrA,
+                          const double*            csrSortedValA, // double variant; the A arrays are all const
+                          const int*               csrSortedRowPtrA,
+                          const int*               csrSortedColIndA,
+                          const double*            threshold, // passed by pointer, same element type as csrSortedValA
+                          const cusparseMatDescr_t descrC,
+                          int*                     csrSortedRowPtrC, // non-const row-pointer array for C
+                          int*                     nnzTotalDevHostPtr, // NOTE(review): name suggests a device or host pointer is accepted - confirm
+                          void*                    pBuffer);
+
+#if defined(__cplusplus)
+// __half variant: this declaration is visible only when __cplusplus is defined.
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneCsr2csr(cusparseHandle_t         handle,
+                      int                      m,
+                      int                      n,
+                      int                      nnzA,
+                      const cusparseMatDescr_t descrA,
+                      const __half*            csrSortedValA, // the A arrays are all const
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      const __half*            threshold, // passed by pointer, same element type as csrSortedValA
+                      const cusparseMatDescr_t descrC,
+                      __half*                  csrSortedValC, // non-const here; const in cusparseHpruneCsr2csr_bufferSizeExt
+                      const int*               csrSortedRowPtrC, // const here; int* in cusparseHpruneCsr2csrNnz
+                      int*                     csrSortedColIndC, // non-const
+                      void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneCsr2csr(cusparseHandle_t         handle,
+                      int                      m,
+                      int                      n,
+                      int                      nnzA,
+                      const cusparseMatDescr_t descrA,
+                      const float*             csrSortedValA, // float variant; the A arrays are all const
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      const float*             threshold, // passed by pointer, same element type as csrSortedValA
+                      const cusparseMatDescr_t descrC,
+                      float*                   csrSortedValC, // non-const here; const in cusparseSpruneCsr2csr_bufferSizeExt
+                      const int*               csrSortedRowPtrC, // const here; int* in cusparseSpruneCsr2csrNnz
+                      int*                     csrSortedColIndC, // non-const
+                      void*                    pBuffer);
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneCsr2csr(cusparseHandle_t         handle,
+                      int                      m,
+                      int                      n,
+                      int                      nnzA,
+                      const cusparseMatDescr_t descrA,
+                      const double*            csrSortedValA, // double variant; the A arrays are all const
+                      const int*               csrSortedRowPtrA,
+                      const int*               csrSortedColIndA,
+                      const double*            threshold, // passed by pointer, same element type as csrSortedValA
+                      const cusparseMatDescr_t descrC,
+                      double*                  csrSortedValC, // non-const here; const in cusparseDpruneCsr2csr_bufferSizeExt
+                      const int*               csrSortedRowPtrC, // const here; int* in cusparseDpruneCsr2csrNnz
+                      int*                     csrSortedColIndC, // non-const
+                      void*                    pBuffer);
+
+#if defined(__cplusplus)
+// __half variant: this declaration is visible only when __cplusplus is defined.
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   const __half*            A,
+                                   int                      lda,
+                                   float                    percentage, // by-value float, where cusparseHpruneDense2csr_bufferSizeExt takes a const __half* threshold
+                                   const cusparseMatDescr_t descrC,
+                                   const __half*            csrSortedValC, // all three C arrays are const in this declaration
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info, // not present in the threshold-based cusparseHpruneDense2csr_bufferSizeExt
+                                   size_t*                  pBufferSizeInBytes); // NOTE(review): presumably receives the workspace size in bytes - confirm
+
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   const float*             A, // float variant
+                                   int                      lda,
+                                   float                    percentage, // by-value float, where cusparseSpruneDense2csr_bufferSizeExt takes a const float* threshold
+                                   const cusparseMatDescr_t descrC,
+                                   const float*             csrSortedValC, // all three C arrays are const in this declaration
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info, // not present in the threshold-based cusparseSpruneDense2csr_bufferSizeExt
+                                   size_t*                  pBufferSizeInBytes); // NOTE(review): presumably receives the workspace size in bytes - confirm
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   const double*            A, // double variant
+                                   int                      lda,
+                                   float                    percentage, // stays float even though the matrix data is double
+                                   const cusparseMatDescr_t descrC,
+                                   const double*            csrSortedValC, // all three C arrays are const in this declaration
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info, // not present in the threshold-based cusparseDpruneDense2csr_bufferSizeExt
+                                   size_t*                  pBufferSizeInBytes); // NOTE(review): presumably receives the workspace size in bytes - confirm
+
+#if defined(__cplusplus)
+// __half variant: this declaration is visible only when __cplusplus is defined.
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const __half*            A,
+                                    int                      lda,
+                                    float                    percentage, // by-value float, where cusparseHpruneDense2csrNnz takes a const __half* threshold
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrRowPtrC, // non-const row-pointer array
+                                    int*                     nnzTotalDevHostPtr, // NOTE(review): name suggests a device or host pointer is accepted - confirm
+                                    pruneInfo_t              info, // not present in the threshold-based cusparseHpruneDense2csrNnz
+                                    void*                    pBuffer);
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const float*             A, // float variant
+                                    int                      lda,
+                                    float                    percentage, // by-value float, where cusparseSpruneDense2csrNnz takes a const float* threshold
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrRowPtrC, // non-const row-pointer array
+                                    int*                     nnzTotalDevHostPtr, // NOTE(review): name suggests a device or host pointer is accepted - confirm
+                                    pruneInfo_t              info, // not present in the threshold-based cusparseSpruneDense2csrNnz
+                                    void*                    pBuffer);
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const double*            A, // double variant
+                                    int                      lda,
+                                    float                    percentage, // stays float even though the matrix data is double
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrRowPtrC, // non-const row-pointer array
+                                    int*                     nnzTotalDevHostPtr, // NOTE(review): name suggests a device or host pointer is accepted - confirm
+                                    pruneInfo_t              info, // not present in the threshold-based cusparseDpruneDense2csrNnz
+                                    void*                    pBuffer);
+
+#if defined(__cplusplus)
+// __half variant: this declaration is visible only when __cplusplus is defined.
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneDense2csrByPercentage(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const __half*            A,
+                                    int                      lda,
+                                    float                    percentage, // by-value float, where cusparseHpruneDense2csr takes a const __half* threshold
+                                    const cusparseMatDescr_t descrC,
+                                    __half*                  csrSortedValC, // non-const here; const in the matching _bufferSizeExt declaration
+                                    const int*               csrSortedRowPtrC, // const here; cusparseHpruneDense2csrNnzByPercentage takes its row pointers as int*
+                                    int*                     csrSortedColIndC, // non-const
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneDense2csrByPercentage(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const float*             A, // float variant
+                                    int                      lda,
+                                    float                    percentage, // by-value float, where cusparseSpruneDense2csr takes a const float* threshold
+                                    const cusparseMatDescr_t descrC,
+                                    float*                   csrSortedValC, // non-const here; const in the matching _bufferSizeExt declaration
+                                    const int*               csrSortedRowPtrC, // const here; cusparseSpruneDense2csrNnzByPercentage takes its row pointers as int*
+                                    int*                     csrSortedColIndC, // non-const
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseDpruneDense2csrByPercentage(cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    const double*            A, // double variant
+                                    int                      lda,
+                                    float                    percentage, // stays float even though the matrix data is double
+                                    const cusparseMatDescr_t descrC,
+                                    double*                  csrSortedValC, // non-const here; const in the matching _bufferSizeExt declaration
+                                    const int*               csrSortedRowPtrC, // const here; cusparseDpruneDense2csrNnzByPercentage takes its row pointers as int*
+                                    int*                     csrSortedColIndC, // non-const
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer);
+
+#if defined(__cplusplus)
+// __half variant: this declaration is visible only when __cplusplus is defined.
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseHpruneCsr2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   int                      nnzA,
+                                   const cusparseMatDescr_t descrA,
+                                   const __half*            csrSortedValA, // the A arrays are all const
+                                   const int*               csrSortedRowPtrA,
+                                   const int*               csrSortedColIndA,
+                                   float                    percentage, // by-value float, where cusparseHpruneCsr2csr_bufferSizeExt takes a const __half* threshold
+                                   const cusparseMatDescr_t descrC,
+                                   const __half*            csrSortedValC, // the C arrays are also const in this declaration
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info, // not present in the threshold-based cusparseHpruneCsr2csr_bufferSizeExt
+                                   size_t*                  pBufferSizeInBytes); // NOTE(review): presumably receives the workspace size in bytes - confirm
+
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED // NOTE(review): macro defined outside this view; presumably a deprecation attribute - confirm
+cusparseStatus_t CUSPARSEAPI
+cusparseSpruneCsr2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   int                      nnzA,
+                                   const cusparseMatDescr_t descrA,
+                                   const float*             csrSortedValA, // float variant; the A arrays are all const
+                                   const int*               csrSortedRowPtrA,
+                                   const int*               csrSortedColIndA,
+                                   float                    percentage, // by-value float, where cusparseSpruneCsr2csr_bufferSizeExt takes a const float* threshold
+                                   const cusparseMatDescr_t descrC,
+                                   const float*             csrSortedValC, // the C arrays are also const in this declaration
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info, // not present in the threshold-based cusparseSpruneCsr2csr_bufferSizeExt
+                                   size_t*                  pBufferSizeInBytes); // NOTE(review): presumably receives the workspace size in bytes - confirm
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI // deprecated (CUSPARSE_DEPRECATED); double-valued variant
+cusparseDpruneCsr2csrByPercentage_bufferSizeExt(
+                                   cusparseHandle_t         handle,
+                                   int                      m,
+                                   int                      n,
+                                   int                      nnzA,
+                                   const cusparseMatDescr_t descrA,
+                                   const double*            csrSortedValA,
+                                   const int*               csrSortedRowPtrA,
+                                   const int*               csrSortedColIndA,
+                                   float                    percentage, // stays float even though the values are double
+                                   const cusparseMatDescr_t descrC,
+                                   const double*            csrSortedValC, // const: not writable through this pointer
+                                   const int*               csrSortedRowPtrC,
+                                   const int*               csrSortedColIndC,
+                                   pruneInfo_t              info,
+                                   size_t*                  pBufferSizeInBytes); // out: presumably the required workspace size in bytes
+
+#if defined(__cplusplus) // the __half variant below is declared only for C++ translation units
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI // deprecated (CUSPARSE_DEPRECATED); __half-valued variant
+cusparseHpruneCsr2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const __half*            csrSortedValA,
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrSortedRowPtrC, // non-const, unlike csrSortedRowPtrA: presumably written by this call
+                                    int*                     nnzTotalDevHostPtr, // out; name suggests device or host memory - TODO confirm
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer); // caller-supplied workspace; presumably sized via the _bufferSizeExt query
+
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI // deprecated (CUSPARSE_DEPRECATED); float-valued variant
+cusparseSpruneCsr2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const float*             csrSortedValA,
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    float                    percentage,
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrSortedRowPtrC, // non-const, unlike csrSortedRowPtrA: presumably written by this call
+                                    int*                     nnzTotalDevHostPtr, // out; name suggests device or host memory - TODO confirm
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer); // caller-supplied workspace; presumably sized via the _bufferSizeExt query
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI // deprecated (CUSPARSE_DEPRECATED); double-valued variant
+cusparseDpruneCsr2csrNnzByPercentage(
+                                    cusparseHandle_t         handle,
+                                    int                      m,
+                                    int                      n,
+                                    int                      nnzA,
+                                    const cusparseMatDescr_t descrA,
+                                    const double*            csrSortedValA,
+                                    const int*               csrSortedRowPtrA,
+                                    const int*               csrSortedColIndA,
+                                    float                    percentage, // stays float even though the values are double
+                                    const cusparseMatDescr_t descrC,
+                                    int*                     csrSortedRowPtrC, // non-const, unlike csrSortedRowPtrA: presumably written by this call
+                                    int*                     nnzTotalDevHostPtr, // out; name suggests device or host memory - TODO confirm
+                                    pruneInfo_t              info,
+                                    void*                    pBuffer); // caller-supplied workspace; presumably sized via the _bufferSizeExt query
+
+#if defined(__cplusplus) // the __half variant below is declared only for C++ translation units
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI // deprecated (CUSPARSE_DEPRECATED); __half-valued variant
+cusparseHpruneCsr2csrByPercentage(cusparseHandle_t         handle,
+                                  int                      m,
+                                  int                      n,
+                                  int                      nnzA,
+                                  const cusparseMatDescr_t descrA,
+                                  const __half*            csrSortedValA,
+                                  const int*               csrSortedRowPtrA,
+                                  const int*               csrSortedColIndA,
+                                  float percentage, /* between 0 and 100 */
+                                  const cusparseMatDescr_t descrC,
+                                  __half*                  csrSortedValC, // non-const: presumably an output array
+                                  const int*               csrSortedRowPtrC, // const here: read, not written, by this call
+                                  int*                     csrSortedColIndC, // non-const: presumably an output array
+                                  pruneInfo_t              info,
+                                  void*                    pBuffer); // caller-supplied workspace
+
+#endif // defined(__cplusplus)
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI // deprecated (CUSPARSE_DEPRECATED); float-valued variant
+cusparseSpruneCsr2csrByPercentage(cusparseHandle_t         handle,
+                                  int                      m,
+                                  int                      n,
+                                  int                      nnzA,
+                                  const cusparseMatDescr_t descrA,
+                                  const float*             csrSortedValA,
+                                  const int*               csrSortedRowPtrA,
+                                  const int*               csrSortedColIndA,
+                                  float                    percentage,
+                                  const cusparseMatDescr_t descrC,
+                                  float*                   csrSortedValC, // non-const: presumably an output array
+                                  const int*               csrSortedRowPtrC, // const here: read, not written, by this call
+                                  int*                     csrSortedColIndC, // non-const: presumably an output array
+                                  pruneInfo_t              info,
+                                  void*                    pBuffer); // caller-supplied workspace
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI // deprecated (CUSPARSE_DEPRECATED); double-valued variant
+cusparseDpruneCsr2csrByPercentage(cusparseHandle_t         handle,
+                                  int                      m,
+                                  int                      n,
+                                  int                      nnzA,
+                                  const cusparseMatDescr_t descrA,
+                                  const double*            csrSortedValA,
+                                  const int*               csrSortedRowPtrA,
+                                  const int*               csrSortedColIndA,
+                                  float                    percentage, // stays float even though the values are double
+                                  const cusparseMatDescr_t descrC,
+                                  double*                  csrSortedValC, // non-const: presumably an output array
+                                  const int*               csrSortedRowPtrC, // const here: read, not written, by this call
+                                  int*                     csrSortedColIndC, // non-const: presumably an output array
+                                  pruneInfo_t              info,
+                                  void*                    pBuffer); // caller-supplied workspace
+
+//##############################################################################
+//# CSR2CSC
+//##############################################################################
+
+typedef enum { // algorithm selector taken by the cusparseCsr2cscEx2 functions
+    CUSPARSE_CSR2CSC_ALG_DEFAULT = 1, // same value as CUSPARSE_CSR2CSC_ALG1
+    CUSPARSE_CSR2CSC_ALG1 = 1 // the only distinct algorithm value listed
+} cusparseCsr2CscAlg_t;
+
+cusparseStatus_t CUSPARSEAPI // CSR -> CSC conversion (per the section banner and the name)
+cusparseCsr2cscEx2(cusparseHandle_t     handle,
+                   int                  m,
+                   int                  n,
+                   int                  nnz,
+                   const void*          csrVal, // untyped input; element type presumably described by valType
+                   const int*           csrRowPtr,
+                   const int*           csrColInd,
+                   void*                cscVal, // non-const: the csc* arrays are presumably the outputs
+                   int*                 cscColPtr,
+                   int*                 cscRowInd,
+                   cudaDataType         valType,
+                   cusparseAction_t     copyValues,
+                   cusparseIndexBase_t  idxBase,
+                   cusparseCsr2CscAlg_t alg,
+                   void*                buffer); // workspace; presumably sized by cusparseCsr2cscEx2_bufferSize
+
+cusparseStatus_t CUSPARSEAPI // same leading arguments as cusparseCsr2cscEx2; only the last one differs
+cusparseCsr2cscEx2_bufferSize(cusparseHandle_t     handle,
+                              int                  m,
+                              int                  n,
+                              int                  nnz,
+                              const void*          csrVal,
+                              const int*           csrRowPtr,
+                              const int*           csrColInd,
+                              void*                cscVal,
+                              int*                 cscColPtr,
+                              int*                 cscRowInd,
+                              cudaDataType         valType,
+                              cusparseAction_t     copyValues,
+                              cusparseIndexBase_t  idxBase,
+                              cusparseCsr2CscAlg_t alg,
+                              size_t*              bufferSize); // out: presumably the workspace size in bytes - TODO confirm unit
+
+// #############################################################################
+// # GENERIC APIs - Enumerators and Opaque Data Structures
+// #############################################################################
+
+typedef enum { // sparse storage format tags; note that the value 4 is not assigned
+    CUSPARSE_FORMAT_CSR            = 1, ///< Compressed Sparse Row (CSR)
+    CUSPARSE_FORMAT_CSC            = 2, ///< Compressed Sparse Column (CSC)
+    CUSPARSE_FORMAT_COO            = 3, ///< Coordinate (COO) - Structure of Arrays
+    CUSPARSE_FORMAT_BLOCKED_ELL    = 5, ///< Blocked ELL
+    CUSPARSE_FORMAT_BSR            = 6, ///< Blocked Compressed Sparse Row (BSR)
+    CUSPARSE_FORMAT_SLICED_ELLPACK = 7 ///< Sliced ELL
+} cusparseFormat_t;
+
+typedef enum { // matrix memory layout selector; values start at 1, not 0
+    CUSPARSE_ORDER_COL = 1, ///< Column-Major Order - Matrix memory layout
+    CUSPARSE_ORDER_ROW = 2  ///< Row-Major Order - Matrix memory layout
+} cusparseOrder_t;
+
+typedef enum { // integer type of matrix/vector index arrays; values start at 1, not 0
+    CUSPARSE_INDEX_16U = 1, ///< 16-bit unsigned integer for matrix/vector
+                            ///< indices
+    CUSPARSE_INDEX_32I = 2, ///< 32-bit signed integer for matrix/vector indices
+    CUSPARSE_INDEX_64I = 3  ///< 64-bit signed integer for matrix/vector indices
+} cusparseIndexType_t;
+
+//------------------------------------------------------------------------------
+
+struct cusparseSpVecDescr; // forward declarations only: no struct definition is given here (opaque types)
+struct cusparseDnVecDescr;
+struct cusparseSpMatDescr;
+struct cusparseDnMatDescr;
+
+typedef struct cusparseSpVecDescr* cusparseSpVecDescr_t; // mutable handles: pointers to the opaque structs
+typedef struct cusparseDnVecDescr* cusparseDnVecDescr_t;
+typedef struct cusparseSpMatDescr* cusparseSpMatDescr_t;
+typedef struct cusparseDnMatDescr* cusparseDnMatDescr_t;
+
+typedef struct cusparseSpVecDescr const* cusparseConstSpVecDescr_t; // const handles: pointers to const-qualified structs
+typedef struct cusparseDnVecDescr const* cusparseConstDnVecDescr_t;
+typedef struct cusparseSpMatDescr const* cusparseConstSpMatDescr_t;
+typedef struct cusparseDnMatDescr const* cusparseConstDnMatDescr_t;
+
+// #############################################################################
+// # SPARSE VECTOR DESCRIPTOR
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI // presumably creates a sparse-vector descriptor, returned through spVecDescr
+cusparseCreateSpVec(cusparseSpVecDescr_t* spVecDescr,
+                    int64_t               size,
+                    int64_t               nnz,
+                    void*                 indices, // untyped; element type presumably described by idxType
+                    void*                 values, // untyped; element type presumably described by valueType
+                    cusparseIndexType_t   idxType,
+                    cusparseIndexBase_t   idxBase,
+                    cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseCreateSpVec: const arrays, const descriptor handle
+cusparseCreateConstSpVec(cusparseConstSpVecDescr_t* spVecDescr,
+                         int64_t                    size,
+                         int64_t                    nnz,
+                         const void*                indices,
+                         const void*                values,
+                         cusparseIndexType_t        idxType,
+                         cusparseIndexBase_t        idxBase,
+                         cudaDataType               valueType);
+
+cusparseStatus_t CUSPARSEAPI // takes the const handle type, so mutable and const descriptors can both be passed
+cusparseDestroySpVec(cusparseConstSpVecDescr_t spVecDescr);
+
+cusparseStatus_t CUSPARSEAPI // every argument after the descriptor is an out-pointer
+cusparseSpVecGet(cusparseSpVecDescr_t spVecDescr,
+                 int64_t*             size,
+                 int64_t*             nnz,
+                 void**               indices, // receives a mutable void*
+                 void**               values, // receives a mutable void*
+                 cusparseIndexType_t* idxType,
+                 cusparseIndexBase_t* idxBase,
+                 cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseSpVecGet: arrays come back as const void*
+cusparseConstSpVecGet(cusparseConstSpVecDescr_t spVecDescr,
+                      int64_t*                  size,
+                      int64_t*                  nnz,
+                      const void**              indices,
+                      const void**              values,
+                      cusparseIndexType_t*      idxType,
+                      cusparseIndexBase_t*      idxBase,
+                      cudaDataType*             valueType);
+
+cusparseStatus_t CUSPARSEAPI // accepts the const handle type
+cusparseSpVecGetIndexBase(cusparseConstSpVecDescr_t spVecDescr,
+                          cusparseIndexBase_t*      idxBase); // out-pointer
+
+cusparseStatus_t CUSPARSEAPI // requires the mutable handle type
+cusparseSpVecGetValues(cusparseSpVecDescr_t spVecDescr,
+                       void**               values); // out: receives a mutable void* (presumably the stored values pointer)
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseSpVecGetValues
+cusparseConstSpVecGetValues(cusparseConstSpVecDescr_t spVecDescr,
+                            const void**              values); // out: receives a const void*
+
+cusparseStatus_t CUSPARSEAPI // setter: only declared for the mutable handle type
+cusparseSpVecSetValues(cusparseSpVecDescr_t spVecDescr,
+                       void*                values); // presumably becomes the descriptor's values pointer
+
+// #############################################################################
+// # DENSE VECTOR DESCRIPTOR
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI // presumably creates a dense-vector descriptor, returned through dnVecDescr
+cusparseCreateDnVec(cusparseDnVecDescr_t* dnVecDescr,
+                    int64_t               size,
+                    void*                 values, // untyped; element type presumably described by valueType
+                    cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseCreateDnVec: const values, const descriptor handle
+cusparseCreateConstDnVec(cusparseConstDnVecDescr_t* dnVecDescr,
+                         int64_t                    size,
+                         const void*                values,
+                         cudaDataType               valueType);
+
+cusparseStatus_t CUSPARSEAPI // takes the const handle type, so mutable and const descriptors can both be passed
+cusparseDestroyDnVec(cusparseConstDnVecDescr_t dnVecDescr);
+
+cusparseStatus_t CUSPARSEAPI // every argument after the descriptor is an out-pointer
+cusparseDnVecGet(cusparseDnVecDescr_t dnVecDescr,
+                 int64_t*             size,
+                 void**               values, // receives a mutable void*
+                 cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseDnVecGet
+cusparseConstDnVecGet(cusparseConstDnVecDescr_t dnVecDescr,
+                      int64_t*                  size,
+                      const void**              values, // receives a const void*
+                      cudaDataType*             valueType);
+
+cusparseStatus_t CUSPARSEAPI // requires the mutable handle type
+cusparseDnVecGetValues(cusparseDnVecDescr_t dnVecDescr,
+                       void**               values); // out: receives a mutable void* (presumably the stored values pointer)
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseDnVecGetValues
+cusparseConstDnVecGetValues(cusparseConstDnVecDescr_t dnVecDescr,
+                            const void**              values); // out: receives a const void*
+
+cusparseStatus_t CUSPARSEAPI // setter: only declared for the mutable handle type
+cusparseDnVecSetValues(cusparseDnVecDescr_t dnVecDescr,
+                       void*                values); // presumably becomes the descriptor's values pointer
+
+// #############################################################################
+// # SPARSE MATRIX DESCRIPTOR
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI // takes the const handle type, so mutable and const descriptors can both be passed
+cusparseDestroySpMat(cusparseConstSpMatDescr_t spMatDescr);
+
+cusparseStatus_t CUSPARSEAPI // accepts the const handle type
+cusparseSpMatGetFormat(cusparseConstSpMatDescr_t spMatDescr,
+                       cusparseFormat_t*         format); // out-pointer; receives a cusparseFormat_t tag
+
+cusparseStatus_t CUSPARSEAPI // accepts the const handle type
+cusparseSpMatGetIndexBase(cusparseConstSpMatDescr_t spMatDescr,
+                          cusparseIndexBase_t*      idxBase); // out-pointer
+
+cusparseStatus_t CUSPARSEAPI // requires the mutable handle type
+cusparseSpMatGetValues(cusparseSpMatDescr_t spMatDescr,
+                       void**               values); // out: receives a mutable void* (presumably the stored values pointer)
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseSpMatGetValues
+cusparseConstSpMatGetValues(cusparseConstSpMatDescr_t spMatDescr,
+                            const void**              values); // out: receives a const void*
+
+cusparseStatus_t CUSPARSEAPI // setter: only declared for the mutable handle type
+cusparseSpMatSetValues(cusparseSpMatDescr_t spMatDescr,
+                       void*                values); // presumably becomes the descriptor's values pointer
+
+cusparseStatus_t CUSPARSEAPI // accepts the const handle type; the three sizes are 64-bit out-pointers
+cusparseSpMatGetSize(cusparseConstSpMatDescr_t spMatDescr,
+                     int64_t*                  rows,
+                     int64_t*                  cols,
+                     int64_t*                  nnz);
+
+cusparseStatus_t CUSPARSEAPI // accepts the const handle type
+cusparseSpMatGetStridedBatch(cusparseConstSpMatDescr_t spMatDescr,
+                             int*                      batchCount); // out-pointer; plain int, as in the Set*StridedBatch functions
+
+cusparseStatus_t CUSPARSEAPI // COO variant: a single stride parameter
+cusparseCooSetStridedBatch(cusparseSpMatDescr_t spMatDescr,
+                           int                  batchCount, // plain int, whereas the stride is 64-bit
+                           int64_t              batchStride); // NOTE(review): unit (elements vs bytes) is not stated here - confirm
+
+cusparseStatus_t CUSPARSEAPI // CSR variant: two stride parameters
+cusparseCsrSetStridedBatch(cusparseSpMatDescr_t spMatDescr,
+                           int                  batchCount,
+                           int64_t              offsetsBatchStride, // stride for the offsets array (per its name)
+                           int64_t              columnsValuesBatchStride); // one stride shared by columns and values (per its name)
+
+cusparseStatus_t CUSPARSEAPI // BSR variant: three separate stride parameters
+cusparseBsrSetStridedBatch(cusparseSpMatDescr_t spMatDescr,
+                           int                  batchCount,
+                           int64_t              offsetsBatchStride,
+                           int64_t              columnsBatchStride, // separate from the values stride, unlike the CSR variant
+                           int64_t              ValuesBatchStride); // NOTE(review): capitalised unlike the sibling parameter names
+
+typedef enum { // attribute selector for cusparseSpMatGetAttribute / cusparseSpMatSetAttribute
+    CUSPARSE_SPMAT_FILL_MODE, // no explicit value: 0 by the C enum rules
+    CUSPARSE_SPMAT_DIAG_TYPE // no explicit value: 1 by the C enum rules
+} cusparseSpMatAttribute_t;
+
+cusparseStatus_t CUSPARSEAPI // accepts the const handle type
+cusparseSpMatGetAttribute(cusparseConstSpMatDescr_t spMatDescr,
+                          cusparseSpMatAttribute_t  attribute,
+                          void*                     data, // untyped destination; presumably receives the attribute value
+                          size_t                    dataSize); // presumably the size of *data in bytes - TODO confirm
+
+cusparseStatus_t CUSPARSEAPI // setter: only declared for the mutable handle type
+cusparseSpMatSetAttribute(cusparseSpMatDescr_t     spMatDescr,
+                          cusparseSpMatAttribute_t attribute,
+                          void*                    data, // NOTE(review): non-const void* even though this is a setter
+                          size_t                   dataSize); // presumably the size of *data in bytes - TODO confirm
+
+//------------------------------------------------------------------------------
+// ### CSR / CSC ###
+
+cusparseStatus_t CUSPARSEAPI // presumably creates a CSR matrix descriptor, returned through spMatDescr
+cusparseCreateCsr(cusparseSpMatDescr_t* spMatDescr,
+                  int64_t               rows,
+                  int64_t               cols,
+                  int64_t               nnz,
+                  void*                 csrRowOffsets,
+                  void*                 csrColInd,
+                  void*                 csrValues,
+                  cusparseIndexType_t   csrRowOffsetsType, // offsets and column indices carry separate index types
+                  cusparseIndexType_t   csrColIndType,
+                  cusparseIndexBase_t   idxBase,
+                  cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseCreateCsr: const arrays, const descriptor handle
+cusparseCreateConstCsr(cusparseConstSpMatDescr_t* spMatDescr,
+                       int64_t                    rows,
+                       int64_t                    cols,
+                       int64_t                    nnz,
+                       const void*                csrRowOffsets,
+                       const void*                csrColInd,
+                       const void*                csrValues,
+                       cusparseIndexType_t        csrRowOffsetsType, // offsets and column indices carry separate index types
+                       cusparseIndexType_t        csrColIndType,
+                       cusparseIndexBase_t        idxBase,
+                       cudaDataType               valueType);
+
+cusparseStatus_t CUSPARSEAPI // CSC analogue of cusparseCreateCsr: column offsets plus row indices
+cusparseCreateCsc(cusparseSpMatDescr_t* spMatDescr,
+                  int64_t               rows,
+                  int64_t               cols,
+                  int64_t               nnz,
+                  void*                 cscColOffsets,
+                  void*                 cscRowInd,
+                  void*                 cscValues,
+                  cusparseIndexType_t   cscColOffsetsType, // offsets and row indices carry separate index types
+                  cusparseIndexType_t   cscRowIndType,
+                  cusparseIndexBase_t   idxBase,
+                  cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseCreateCsc: const arrays, const descriptor handle
+cusparseCreateConstCsc(cusparseConstSpMatDescr_t* spMatDescr,
+                       int64_t                    rows,
+                       int64_t                    cols,
+                       int64_t                    nnz,
+                       const void*                cscColOffsets,
+                       const void*                cscRowInd,
+                       const void*                cscValues,
+                       cusparseIndexType_t        cscColOffsetsType, // offsets and row indices carry separate index types
+                       cusparseIndexType_t        cscRowIndType,
+                       cusparseIndexBase_t        idxBase,
+                       cudaDataType               valueType);
+
+cusparseStatus_t CUSPARSEAPI // out-pointer mirror of cusparseCreateCsr's parameter list
+cusparseCsrGet(cusparseSpMatDescr_t spMatDescr,
+               int64_t*             rows,
+               int64_t*             cols,
+               int64_t*             nnz,
+               void**               csrRowOffsets, // the three arrays come back as mutable void*
+               void**               csrColInd,
+               void**               csrValues,
+               cusparseIndexType_t* csrRowOffsetsType,
+               cusparseIndexType_t* csrColIndType,
+               cusparseIndexBase_t* idxBase,
+               cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseCsrGet
+cusparseConstCsrGet(cusparseConstSpMatDescr_t spMatDescr,
+                    int64_t*                  rows,
+                    int64_t*                  cols,
+                    int64_t*                  nnz,
+                    const void**              csrRowOffsets, // the three arrays come back as const void*
+                    const void**              csrColInd,
+                    const void**              csrValues,
+                    cusparseIndexType_t*      csrRowOffsetsType,
+                    cusparseIndexType_t*      csrColIndType,
+                    cusparseIndexBase_t*      idxBase,
+                    cudaDataType*             valueType);
+
+cusparseStatus_t CUSPARSEAPI // out-pointer mirror of cusparseCreateCsc's parameter list
+cusparseCscGet(cusparseSpMatDescr_t spMatDescr,
+               int64_t*             rows,
+               int64_t*             cols,
+               int64_t*             nnz,
+               void**               cscColOffsets, // the three arrays come back as mutable void*
+               void**               cscRowInd,
+               void**               cscValues,
+               cusparseIndexType_t* cscColOffsetsType,
+               cusparseIndexType_t* cscRowIndType,
+               cusparseIndexBase_t* idxBase,
+               cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseCscGet
+cusparseConstCscGet(cusparseConstSpMatDescr_t spMatDescr,
+                    int64_t*                  rows,
+                    int64_t*                  cols,
+                    int64_t*                  nnz,
+                    const void**              cscColOffsets, // the three arrays come back as const void*
+                    const void**              cscRowInd,
+                    const void**              cscValues,
+                    cusparseIndexType_t*      cscColOffsetsType,
+                    cusparseIndexType_t*      cscRowIndType,
+                    cusparseIndexBase_t*      idxBase,
+                    cudaDataType*             valueType);
+
+cusparseStatus_t CUSPARSEAPI // takes only the three array pointers: no sizes or types are passed
+cusparseCsrSetPointers(cusparseSpMatDescr_t spMatDescr,
+                       void*                csrRowOffsets,
+                       void*                csrColInd,
+                       void*                csrValues);
+
+cusparseStatus_t CUSPARSEAPI // takes only the three array pointers: no sizes or types are passed
+cusparseCscSetPointers(cusparseSpMatDescr_t spMatDescr,
+                       void*                cscColOffsets,
+                       void*                cscRowInd,
+                       void*                cscValues);
+
+//------------------------------------------------------------------------------
+// ### BSR ###
+
+cusparseStatus_t CUSPARSEAPI // presumably creates a BSR matrix descriptor, returned through spMatDescr
+cusparseCreateBsr(cusparseSpMatDescr_t* spMatDescr,
+                  int64_t               brows, // sizes are prefixed with 'b'; presumably counted in blocks - TODO confirm
+                  int64_t               bcols,
+                  int64_t               bnnz,
+                  int64_t               rowBlockSize, // NOTE(review): cusparseCreateConstBsr names this rowBlockDim
+                  int64_t               colBlockSize, // NOTE(review): cusparseCreateConstBsr names this colBlockDim
+                  void*                 bsrRowOffsets,
+                  void*                 bsrColInd,
+                  void*                 bsrValues,
+                  cusparseIndexType_t   bsrRowOffsetsType,
+                  cusparseIndexType_t   bsrColIndType,
+                  cusparseIndexBase_t   idxBase,
+                  cudaDataType          valueType,
+                  cusparseOrder_t       order); // extra layout argument not present in the CSR/CSC creators
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseCreateBsr: const arrays, const descriptor handle
+cusparseCreateConstBsr(cusparseConstSpMatDescr_t* spMatDescr,
+                       int64_t                    brows,
+                       int64_t                    bcols,
+                       int64_t                    bnnz,
+                       int64_t                    rowBlockDim, // NOTE(review): cusparseCreateBsr names this rowBlockSize
+                       int64_t                    colBlockDim, // NOTE(review): cusparseCreateBsr names this colBlockSize
+                       const void*                bsrRowOffsets,
+                       const void*                bsrColInd,
+                       const void*                bsrValues,
+                       cusparseIndexType_t        bsrRowOffsetsType,
+                       cusparseIndexType_t        bsrColIndType,
+                       cusparseIndexBase_t        idxBase,
+                       cudaDataType               valueType,
+                       cusparseOrder_t            order);
+
+//------------------------------------------------------------------------------
+// ### COO ###
+
+cusparseStatus_t CUSPARSEAPI // presumably creates a COO matrix descriptor, returned through spMatDescr
+cusparseCreateCoo(cusparseSpMatDescr_t* spMatDescr,
+                  int64_t               rows,
+                  int64_t               cols,
+                  int64_t               nnz,
+                  void*                 cooRowInd,
+                  void*                 cooColInd,
+                  void*                 cooValues,
+                  cusparseIndexType_t   cooIdxType, // a single index type is passed for both index arrays
+                  cusparseIndexBase_t   idxBase,
+                  cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseCreateCoo: const arrays, const descriptor handle
+cusparseCreateConstCoo(cusparseConstSpMatDescr_t* spMatDescr,
+                       int64_t                    rows,
+                       int64_t                    cols,
+                       int64_t                    nnz,
+                       const void*                cooRowInd,
+                       const void*                cooColInd,
+                       const void*                cooValues,
+                       cusparseIndexType_t        cooIdxType, // a single index type is passed for both index arrays
+                       cusparseIndexBase_t        idxBase,
+                       cudaDataType               valueType);
+
+cusparseStatus_t CUSPARSEAPI // out-pointer mirror of cusparseCreateCoo's parameter list
+cusparseCooGet(cusparseSpMatDescr_t spMatDescr,
+               int64_t*             rows,
+               int64_t*             cols,
+               int64_t*             nnz,
+               void**               cooRowInd,  // COO row indices
+               void**               cooColInd,  // COO column indices
+               void**               cooValues,  // COO values
+               cusparseIndexType_t* idxType, // single index type for both index arrays
+               cusparseIndexBase_t* idxBase,
+               cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseCooGet: arrays come back as const void*
+cusparseConstCooGet(cusparseConstSpMatDescr_t spMatDescr,
+                    int64_t*                  rows,
+                    int64_t*                  cols,
+                    int64_t*                  nnz,
+                    const void**              cooRowInd,  // COO row indices
+                    const void**              cooColInd,  // COO column indices
+                    const void**              cooValues,  // COO values
+                    cusparseIndexType_t*      idxType, // single index type for both index arrays
+                    cusparseIndexBase_t*      idxBase,
+                    cudaDataType*             valueType);
+
+cusparseStatus_t CUSPARSEAPI // takes only the three array pointers: no sizes or types are passed
+cusparseCooSetPointers(cusparseSpMatDescr_t spMatDescr,
+                       void*                cooRows, // named cooRowInd in cusparseCreateCoo / cusparseCooGet
+                       void*                cooColumns, // named cooColInd in cusparseCreateCoo / cusparseCooGet
+                       void*                cooValues);
+
+//------------------------------------------------------------------------------
+// ### BLOCKED ELL ###
+
+cusparseStatus_t CUSPARSEAPI // presumably creates a Blocked-ELL descriptor; note there is no nnz parameter
+cusparseCreateBlockedEll(cusparseSpMatDescr_t* spMatDescr,
+                         int64_t               rows,
+                         int64_t               cols,
+                         int64_t               ellBlockSize,
+                         int64_t               ellCols,
+                         void*                 ellColInd,
+                         void*                 ellValue,
+                         cusparseIndexType_t   ellIdxType, // only one index array, hence one index type
+                         cusparseIndexBase_t   idxBase,
+                         cudaDataType          valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseCreateBlockedEll: const arrays, const descriptor handle
+cusparseCreateConstBlockedEll(cusparseConstSpMatDescr_t* spMatDescr,
+                              int64_t                    rows,
+                              int64_t                    cols,
+                              int64_t                    ellBlockSize,
+                              int64_t                    ellCols,
+                              const void*                ellColInd,
+                              const void*                ellValue,
+                              cusparseIndexType_t        ellIdxType,
+                              cusparseIndexBase_t        idxBase,
+                              cudaDataType               valueType);
+
+cusparseStatus_t CUSPARSEAPI // out-pointer mirror of cusparseCreateBlockedEll's parameter list
+cusparseBlockedEllGet(cusparseSpMatDescr_t spMatDescr,
+                      int64_t*             rows,
+                      int64_t*             cols,
+                      int64_t*             ellBlockSize,
+                      int64_t*             ellCols,
+                      void**               ellColInd, // the two arrays come back as mutable void*
+                      void**               ellValue,
+                      cusparseIndexType_t* ellIdxType,
+                      cusparseIndexBase_t* idxBase,
+                      cudaDataType*        valueType);
+
+cusparseStatus_t CUSPARSEAPI // const counterpart of cusparseBlockedEllGet
+cusparseConstBlockedEllGet(cusparseConstSpMatDescr_t spMatDescr,
+                           int64_t*                  rows,
+                           int64_t*                  cols,
+                           int64_t*                  ellBlockSize,
+                           int64_t*                  ellCols,
+                           const void**              ellColInd, // the two arrays come back as const void*
+                           const void**              ellValue,
+                           cusparseIndexType_t*      ellIdxType,
+                           cusparseIndexBase_t*      idxBase,
+                           cudaDataType*             valueType);
+
+//------------------------------------------------------------------------------
+// ### Sliced ELLPACK ###
+
+cusparseStatus_t CUSPARSEAPI // non-const variant: buffers are passed as writable void*
+cusparseCreateSlicedEll(cusparseSpMatDescr_t*   spMatDescr,
+                        int64_t                 rows,
+                        int64_t                 cols,
+                        int64_t                 nnz,
+                        int64_t                 sellValuesSize,
+                        int64_t                 sliceSize,
+                        void*                   sellSliceOffsets,
+                        void*                   sellColInd,
+                        void*                   sellValues,
+                        cusparseIndexType_t     sellSliceOffsetsType,
+                        cusparseIndexType_t     sellColIndType,
+                        cusparseIndexBase_t     idxBase,
+                        cudaDataType            valueType);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateConstSlicedEll(cusparseConstSpMatDescr_t* spMatDescr,
+                             int64_t                    rows,
+                             int64_t                    cols,
+                             int64_t                    nnz,
+                             int64_t                    sellValuesSize,
+                             int64_t                    sliceSize,
+                             const void*                sellSliceOffsets,
+                             const void*                sellColInd,
+                             const void*                sellValues,
+                             cusparseIndexType_t        sellSliceOffsetsType,
+                             cusparseIndexType_t        sellColIndType,
+                             cusparseIndexBase_t        idxBase,
+                             cudaDataType               valueType);
+
+// #############################################################################
+// # DENSE MATRIX DESCRIPTOR
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateDnMat(cusparseDnMatDescr_t* dnMatDescr,
+                    int64_t               rows,
+                    int64_t               cols,
+                    int64_t               ld,
+                    void*                 values,
+                    cudaDataType          valueType,
+                    cusparseOrder_t       order);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseCreateConstDnMat(cusparseConstDnMatDescr_t* dnMatDescr,
+                         int64_t                    rows,
+                         int64_t                    cols,
+                         int64_t                    ld,
+                         const void*                values,
+                         cudaDataType               valueType,
+                         cusparseOrder_t            order);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDestroyDnMat(cusparseConstDnMatDescr_t dnMatDescr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnMatGet(cusparseDnMatDescr_t dnMatDescr,
+                 int64_t*             rows,
+                 int64_t*             cols,
+                 int64_t*             ld,
+                 void**               values,
+                 cudaDataType*        type,
+                 cusparseOrder_t*     order);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseConstDnMatGet(cusparseConstDnMatDescr_t dnMatDescr,
+                      int64_t*                  rows,
+                      int64_t*                  cols,
+                      int64_t*                  ld,
+                      const void**              values,
+                      cudaDataType*             type,
+                      cusparseOrder_t*          order);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnMatGetValues(cusparseDnMatDescr_t dnMatDescr,
+                       void**               values);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseConstDnMatGetValues(cusparseConstDnMatDescr_t dnMatDescr,
+                            const void**              values);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnMatSetValues(cusparseDnMatDescr_t dnMatDescr,
+                       void*                values);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnMatSetStridedBatch(cusparseDnMatDescr_t dnMatDescr,
+                             int                  batchCount,
+                             int64_t              batchStride);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDnMatGetStridedBatch(cusparseConstDnMatDescr_t dnMatDescr,
+                             int*                      batchCount,
+                             int64_t*                  batchStride);
+
+// #############################################################################
+// # VECTOR-VECTOR OPERATIONS
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseAxpby(cusparseHandle_t          handle,
+              const void*               alpha,
+              cusparseConstSpVecDescr_t vecX,
+              const void*               beta,
+              cusparseDnVecDescr_t      vecY);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseGather(cusparseHandle_t          handle,
+               cusparseConstDnVecDescr_t vecY,
+               cusparseSpVecDescr_t      vecX);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseScatter(cusparseHandle_t          handle,
+                cusparseConstSpVecDescr_t vecX,
+                cusparseDnVecDescr_t      vecY);
+
+CUSPARSE_DEPRECATED
+cusparseStatus_t CUSPARSEAPI
+cusparseRot(cusparseHandle_t     handle,
+            const void*          c_coeff,
+            const void*          s_coeff,
+            cusparseSpVecDescr_t vecX,
+            cusparseDnVecDescr_t vecY);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpVV_bufferSize(cusparseHandle_t          handle,
+                        cusparseOperation_t       opX,
+                        cusparseConstSpVecDescr_t vecX,
+                        cusparseConstDnVecDescr_t vecY,
+                        const void*               result,
+                        cudaDataType              computeType,
+                        size_t*                   bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpVV(cusparseHandle_t          handle,
+             cusparseOperation_t       opX,
+             cusparseConstSpVecDescr_t vecX,
+             cusparseConstDnVecDescr_t vecY,
+             void*                     result,
+             cudaDataType              computeType,
+             void*                     externalBuffer);
+
+// #############################################################################
+// # SPARSE TO DENSE
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_SPARSETODENSE_ALG_DEFAULT = 0
+} cusparseSparseToDenseAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSparseToDense_bufferSize(cusparseHandle_t           handle,
+                                 cusparseConstSpMatDescr_t  matA,
+                                 cusparseDnMatDescr_t       matB,
+                                 cusparseSparseToDenseAlg_t alg,
+                                 size_t*                    bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSparseToDense(cusparseHandle_t           handle,
+                      cusparseConstSpMatDescr_t  matA,
+                      cusparseDnMatDescr_t       matB,
+                      cusparseSparseToDenseAlg_t alg,
+                      void*                      externalBuffer);
+
+// #############################################################################
+// # DENSE TO SPARSE
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_DENSETOSPARSE_ALG_DEFAULT = 0
+} cusparseDenseToSparseAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDenseToSparse_bufferSize(cusparseHandle_t           handle,
+                                 cusparseConstDnMatDescr_t  matA,
+                                 cusparseSpMatDescr_t       matB,
+                                 cusparseDenseToSparseAlg_t alg,
+                                 size_t*                    bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDenseToSparse_analysis(cusparseHandle_t           handle,
+                               cusparseConstDnMatDescr_t  matA,
+                               cusparseSpMatDescr_t       matB,
+                               cusparseDenseToSparseAlg_t alg,
+                               void*                      externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseDenseToSparse_convert(cusparseHandle_t           handle,
+                              cusparseConstDnMatDescr_t  matA,
+                              cusparseSpMatDescr_t       matB,
+                              cusparseDenseToSparseAlg_t alg,
+                              void*                      externalBuffer);
+
+// #############################################################################
+// # SPARSE MATRIX-VECTOR MULTIPLICATION
+// #############################################################################
+
+typedef enum { // NOTE: numeric values are not in declaration order
+    CUSPARSE_SPMV_ALG_DEFAULT = 0,
+    CUSPARSE_SPMV_CSR_ALG1    = 2,
+    CUSPARSE_SPMV_CSR_ALG2    = 3,
+    CUSPARSE_SPMV_COO_ALG1    = 1, // listed after the CSR entries despite the lower value
+    CUSPARSE_SPMV_COO_ALG2    = 4,
+    CUSPARSE_SPMV_SELL_ALG1   = 5
+} cusparseSpMVAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMV(cusparseHandle_t          handle,
+             cusparseOperation_t       opA,
+             const void*               alpha,
+             cusparseConstSpMatDescr_t matA,
+             cusparseConstDnVecDescr_t vecX,
+             const void*               beta,
+             cusparseDnVecDescr_t      vecY,
+             cudaDataType              computeType,
+             cusparseSpMVAlg_t         alg,
+             void*                     externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMV_bufferSize(cusparseHandle_t          handle,
+                        cusparseOperation_t       opA,
+                        const void*               alpha,
+                        cusparseConstSpMatDescr_t matA,
+                        cusparseConstDnVecDescr_t vecX,
+                        const void*               beta,
+                        cusparseDnVecDescr_t      vecY,
+                        cudaDataType              computeType,
+                        cusparseSpMVAlg_t         alg,
+                        size_t*                   bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMV_preprocess(cusparseHandle_t          handle,
+                        cusparseOperation_t       opA,
+                        const void*               alpha,
+                        cusparseConstSpMatDescr_t matA,
+                        cusparseConstDnVecDescr_t vecX,
+                        const void*               beta,
+                        cusparseDnVecDescr_t      vecY,
+                        cudaDataType              computeType,
+                        cusparseSpMVAlg_t         alg,
+                        void*                     externalBuffer);
+// #############################################################################
+// # SPARSE TRIANGULAR VECTOR SOLVE
+// #############################################################################
+
+typedef enum { // only one SpSV algorithm value is declared
+    CUSPARSE_SPSV_ALG_DEFAULT = 0
+} cusparseSpSVAlg_t;
+
+typedef enum {
+    CUSPARSE_SPSV_UPDATE_GENERAL  = 0,
+    CUSPARSE_SPSV_UPDATE_DIAGONAL = 1
+} cusparseSpSVUpdate_t;
+
+struct cusparseSpSVDescr;
+typedef struct cusparseSpSVDescr* cusparseSpSVDescr_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSV_createDescr(cusparseSpSVDescr_t* descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSV_destroyDescr(cusparseSpSVDescr_t descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSV_bufferSize(cusparseHandle_t          handle,
+                        cusparseOperation_t       opA,
+                        const void*               alpha,
+                        cusparseConstSpMatDescr_t matA,
+                        cusparseConstDnVecDescr_t vecX,
+                        cusparseDnVecDescr_t      vecY,
+                        cudaDataType              computeType,
+                        cusparseSpSVAlg_t         alg,
+                        cusparseSpSVDescr_t       spsvDescr,
+                        size_t*                   bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSV_analysis(cusparseHandle_t          handle,
+                      cusparseOperation_t       opA,
+                      const void*               alpha,
+                      cusparseConstSpMatDescr_t matA,
+                      cusparseConstDnVecDescr_t vecX,
+                      cusparseDnVecDescr_t      vecY,
+                      cudaDataType              computeType,
+                      cusparseSpSVAlg_t         alg,
+                      cusparseSpSVDescr_t       spsvDescr,
+                      void*                     externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSV_solve(cusparseHandle_t          handle,
+                   cusparseOperation_t       opA,
+                   const void*               alpha,
+                   cusparseConstSpMatDescr_t matA,
+                   cusparseConstDnVecDescr_t vecX,
+                   cusparseDnVecDescr_t      vecY,
+                   cudaDataType              computeType,
+                   cusparseSpSVAlg_t         alg,
+                   cusparseSpSVDescr_t       spsvDescr);
+
+cusparseStatus_t CUSPARSEAPI // updatePart takes a cusparseSpSVUpdate_t (GENERAL or DIAGONAL)
+cusparseSpSV_updateMatrix(cusparseHandle_t      handle,
+                          cusparseSpSVDescr_t   spsvDescr,
+                          void*                 newValues,
+                          cusparseSpSVUpdate_t  updatePart);
+
+
+
+// #############################################################################
+// # SPARSE TRIANGULAR MATRIX SOLVE
+// #############################################################################
+
+typedef enum { // only one SpSM algorithm value is declared
+    CUSPARSE_SPSM_ALG_DEFAULT = 0
+} cusparseSpSMAlg_t;
+
+typedef enum {
+    CUSPARSE_SPSM_UPDATE_GENERAL  = 0,
+    CUSPARSE_SPSM_UPDATE_DIAGONAL = 1
+} cusparseSpSMUpdate_t;
+
+struct cusparseSpSMDescr;
+typedef struct cusparseSpSMDescr* cusparseSpSMDescr_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSM_createDescr(cusparseSpSMDescr_t* descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSM_destroyDescr(cusparseSpSMDescr_t descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSM_bufferSize(cusparseHandle_t          handle,
+                        cusparseOperation_t       opA,
+                        cusparseOperation_t       opB,
+                        const void*               alpha,
+                        cusparseConstSpMatDescr_t matA,
+                        cusparseConstDnMatDescr_t matB,
+                        cusparseDnMatDescr_t      matC,
+                        cudaDataType              computeType,
+                        cusparseSpSMAlg_t         alg,
+                        cusparseSpSMDescr_t       spsmDescr,
+                        size_t*                   bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSM_analysis(cusparseHandle_t          handle,
+                      cusparseOperation_t       opA,
+                      cusparseOperation_t       opB,
+                      const void*               alpha,
+                      cusparseConstSpMatDescr_t matA,
+                      cusparseConstDnMatDescr_t matB,
+                      cusparseDnMatDescr_t      matC,
+                      cudaDataType              computeType,
+                      cusparseSpSMAlg_t         alg,
+                      cusparseSpSMDescr_t       spsmDescr,
+                      void*                     externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpSM_solve(cusparseHandle_t          handle,
+                   cusparseOperation_t       opA,
+                   cusparseOperation_t       opB,
+                   const void*               alpha,
+                   cusparseConstSpMatDescr_t matA,
+                   cusparseConstDnMatDescr_t matB,
+                   cusparseDnMatDescr_t      matC,
+                   cudaDataType              computeType,
+                   cusparseSpSMAlg_t         alg,
+                   cusparseSpSMDescr_t       spsmDescr);
+
+cusparseStatus_t CUSPARSEAPI // updatePart takes a cusparseSpSMUpdate_t (GENERAL or DIAGONAL)
+cusparseSpSM_updateMatrix(cusparseHandle_t      handle,
+                          cusparseSpSMDescr_t   spsmDescr,
+                          void*                 newValues,
+                          cusparseSpSMUpdate_t  updatePart);
+
+// #############################################################################
+// # SPARSE MATRIX-MATRIX MULTIPLICATION
+// #############################################################################
+
+typedef enum { // NOTE: values are neither in declaration order nor contiguous
+    CUSPARSE_SPMM_ALG_DEFAULT      = 0,
+    CUSPARSE_SPMM_COO_ALG1         = 1,
+    CUSPARSE_SPMM_COO_ALG2         = 2,
+    CUSPARSE_SPMM_COO_ALG3         = 3,
+    CUSPARSE_SPMM_COO_ALG4         = 5, // 4 is taken by CUSPARSE_SPMM_CSR_ALG1 below
+    CUSPARSE_SPMM_CSR_ALG1         = 4,
+    CUSPARSE_SPMM_CSR_ALG2         = 6,
+    CUSPARSE_SPMM_CSR_ALG3         = 12, // values 7-11 are not assigned in this enum
+    CUSPARSE_SPMM_BLOCKED_ELL_ALG1 = 13
+} cusparseSpMMAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMM_bufferSize(cusparseHandle_t          handle,
+                        cusparseOperation_t       opA,
+                        cusparseOperation_t       opB,
+                        const void*               alpha,
+                        cusparseConstSpMatDescr_t matA,
+                        cusparseConstDnMatDescr_t matB,
+                        const void*               beta,
+                        cusparseDnMatDescr_t      matC,
+                        cudaDataType              computeType,
+                        cusparseSpMMAlg_t         alg,
+                        size_t*                   bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMM_preprocess(cusparseHandle_t          handle,
+                        cusparseOperation_t       opA,
+                        cusparseOperation_t       opB,
+                        const void*               alpha,
+                        cusparseConstSpMatDescr_t matA,
+                        cusparseConstDnMatDescr_t matB,
+                        const void*               beta,
+                        cusparseDnMatDescr_t      matC,
+                        cudaDataType              computeType,
+                        cusparseSpMMAlg_t         alg,
+                        void*                     externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMM(cusparseHandle_t          handle,
+             cusparseOperation_t       opA,
+             cusparseOperation_t       opB,
+             const void*               alpha,
+             cusparseConstSpMatDescr_t matA,
+             cusparseConstDnMatDescr_t matB,
+             const void*               beta,
+             cusparseDnMatDescr_t      matC,
+             cudaDataType              computeType,
+             cusparseSpMMAlg_t         alg,
+             void*                     externalBuffer);
+
+// #############################################################################
+// # SPARSE MATRIX - SPARSE MATRIX MULTIPLICATION (SpGEMM)
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_SPGEMM_DEFAULT                 = 0,
+    CUSPARSE_SPGEMM_CSR_ALG_DETERMINITIC    = 1, // sic: spelling is part of the declared identifier
+    CUSPARSE_SPGEMM_CSR_ALG_NONDETERMINITIC = 2, // sic: do not "correct" it, callers use this name
+    CUSPARSE_SPGEMM_ALG1                    = 3,
+    CUSPARSE_SPGEMM_ALG2                    = 4,
+    CUSPARSE_SPGEMM_ALG3                    = 5
+} cusparseSpGEMMAlg_t;
+
+struct cusparseSpGEMMDescr;
+typedef struct cusparseSpGEMMDescr* cusparseSpGEMMDescr_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_createDescr(cusparseSpGEMMDescr_t* descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_destroyDescr(cusparseSpGEMMDescr_t descr);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_workEstimation(cusparseHandle_t          handle,
+                              cusparseOperation_t       opA,
+                              cusparseOperation_t       opB,
+                              const void*               alpha,
+                              cusparseConstSpMatDescr_t matA,
+                              cusparseConstSpMatDescr_t matB,
+                              const void*               beta,
+                              cusparseSpMatDescr_t      matC,
+                              cudaDataType              computeType,
+                              cusparseSpGEMMAlg_t       alg,
+                              cusparseSpGEMMDescr_t     spgemmDescr,
+                              size_t*                   bufferSize1,
+                              void*                     externalBuffer1);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_getNumProducts(cusparseSpGEMMDescr_t spgemmDescr,
+                              int64_t*              num_prods);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_estimateMemory(cusparseHandle_t          handle,
+                              cusparseOperation_t       opA,
+                              cusparseOperation_t       opB,
+                              const void*               alpha,
+                              cusparseConstSpMatDescr_t matA,
+                              cusparseConstSpMatDescr_t matB,
+                              const void*               beta,
+                              cusparseSpMatDescr_t      matC,
+                              cudaDataType              computeType,
+                              cusparseSpGEMMAlg_t       alg,
+                              cusparseSpGEMMDescr_t     spgemmDescr,
+                              float                     chunk_fraction,
+                              size_t*                   bufferSize3,
+                              void*                     externalBuffer3,
+                              size_t*                   bufferSize2);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_compute(cusparseHandle_t          handle,
+                       cusparseOperation_t       opA,
+                       cusparseOperation_t       opB,
+                       const void*               alpha,
+                       cusparseConstSpMatDescr_t matA,
+                       cusparseConstSpMatDescr_t matB,
+                       const void*               beta,
+                       cusparseSpMatDescr_t      matC,
+                       cudaDataType              computeType,
+                       cusparseSpGEMMAlg_t       alg,
+                       cusparseSpGEMMDescr_t     spgemmDescr,
+                       size_t*                   bufferSize2,
+                       void*                     externalBuffer2);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMM_copy(cusparseHandle_t          handle,
+                    cusparseOperation_t       opA,
+                    cusparseOperation_t       opB,
+                    const void*               alpha,
+                    cusparseConstSpMatDescr_t matA,
+                    cusparseConstSpMatDescr_t matB,
+                    const void*               beta,
+                    cusparseSpMatDescr_t      matC,
+                    cudaDataType              computeType,
+                    cusparseSpGEMMAlg_t       alg,
+                    cusparseSpGEMMDescr_t     spgemmDescr);
+
+// #############################################################################
+// # SPARSE MATRIX - SPARSE MATRIX MULTIPLICATION (SpGEMM) STRUCTURE REUSE
+// #############################################################################
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMMreuse_workEstimation(cusparseHandle_t          handle,
+                                   cusparseOperation_t       opA,
+                                   cusparseOperation_t       opB,
+                                   cusparseConstSpMatDescr_t matA,
+                                   cusparseConstSpMatDescr_t matB,
+                                   cusparseSpMatDescr_t      matC,
+                                   cusparseSpGEMMAlg_t       alg,
+                                   cusparseSpGEMMDescr_t     spgemmDescr,
+                                   size_t*                   bufferSize1,
+                                   void*                     externalBuffer1);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMMreuse_nnz(cusparseHandle_t          handle,
+                        cusparseOperation_t       opA,
+                        cusparseOperation_t       opB,
+                        cusparseConstSpMatDescr_t matA,
+                        cusparseConstSpMatDescr_t matB,
+                        cusparseSpMatDescr_t      matC,
+                        cusparseSpGEMMAlg_t       alg,
+                        cusparseSpGEMMDescr_t     spgemmDescr,
+                        size_t*                   bufferSize2,
+                        void*                     externalBuffer2,
+                        size_t*                   bufferSize3,
+                        void*                     externalBuffer3,
+                        size_t*                   bufferSize4,
+                        void*                     externalBuffer4);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMMreuse_copy(cusparseHandle_t          handle,
+                         cusparseOperation_t       opA,
+                         cusparseOperation_t       opB,
+                         cusparseConstSpMatDescr_t matA,
+                         cusparseConstSpMatDescr_t matB,
+                         cusparseSpMatDescr_t      matC,
+                         cusparseSpGEMMAlg_t       alg,
+                         cusparseSpGEMMDescr_t     spgemmDescr,
+                         size_t*                   bufferSize5,
+                         void*                     externalBuffer5);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpGEMMreuse_compute(cusparseHandle_t          handle,
+                            cusparseOperation_t       opA,
+                            cusparseOperation_t       opB,
+                            const void*               alpha,
+                            cusparseConstSpMatDescr_t matA,
+                            cusparseConstSpMatDescr_t matB,
+                            const void*               beta,
+                            cusparseSpMatDescr_t      matC,
+                            cudaDataType              computeType,
+                            cusparseSpGEMMAlg_t       alg,
+                            cusparseSpGEMMDescr_t     spgemmDescr);
+
+// #############################################################################
+// # SAMPLED DENSE-DENSE MATRIX MULTIPLICATION
+// #############################################################################
+
+typedef enum {
+    CUSPARSE_SDDMM_ALG_DEFAULT = 0
+} cusparseSDDMMAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSDDMM_bufferSize(cusparseHandle_t          handle,
+                         cusparseOperation_t       opA,
+                         cusparseOperation_t       opB,
+                         const void*               alpha,
+                         cusparseConstDnMatDescr_t matA,
+                         cusparseConstDnMatDescr_t matB,
+                         const void*               beta,
+                         cusparseSpMatDescr_t      matC,
+                         cudaDataType              computeType,
+                         cusparseSDDMMAlg_t        alg,
+                         size_t*                   bufferSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSDDMM_preprocess(cusparseHandle_t          handle,
+                         cusparseOperation_t       opA,
+                         cusparseOperation_t       opB,
+                         const void*               alpha,
+                         cusparseConstDnMatDescr_t matA,
+                         cusparseConstDnMatDescr_t matB,
+                         const void*               beta,
+                         cusparseSpMatDescr_t      matC,
+                         cudaDataType              computeType,
+                         cusparseSDDMMAlg_t        alg,
+                         void*                     externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSDDMM(cusparseHandle_t          handle,
+              cusparseOperation_t       opA,
+              cusparseOperation_t       opB,
+              const void*               alpha,
+              cusparseConstDnMatDescr_t matA,
+              cusparseConstDnMatDescr_t matB,
+              const void*               beta,
+              cusparseSpMatDescr_t      matC,
+              cudaDataType              computeType,
+              cusparseSDDMMAlg_t        alg,
+              void*                     externalBuffer);
+
+// #############################################################################
+// # GENERIC APIs WITH CUSTOM OPERATORS (PREVIEW)
+// #############################################################################
+
+struct cusparseSpMMOpPlan;
+typedef struct cusparseSpMMOpPlan*       cusparseSpMMOpPlan_t;
+
+typedef enum { // explicit 0 matches the other *_ALG_DEFAULT enumerators; value is unchanged
+    CUSPARSE_SPMM_OP_ALG_DEFAULT = 0
+} cusparseSpMMOpAlg_t;
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMMOp_createPlan(cusparseHandle_t          handle,
+                          cusparseSpMMOpPlan_t*     plan,
+                          cusparseOperation_t       opA,
+                          cusparseOperation_t       opB,
+                          cusparseConstSpMatDescr_t matA,
+                          cusparseConstDnMatDescr_t matB,
+                          cusparseDnMatDescr_t      matC,
+                          cudaDataType              computeType,
+                          cusparseSpMMOpAlg_t       alg,
+                          const void*               addOperationNvvmBuffer,
+                          size_t                    addOperationBufferSize,
+                          const void*               mulOperationNvvmBuffer,
+                          size_t                    mulOperationBufferSize,
+                          const void*               epilogueNvvmBuffer,
+                          size_t                    epilogueBufferSize,
+                          size_t*                   SpMMWorkspaceSize);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMMOp(cusparseSpMMOpPlan_t plan,
+               void*                externalBuffer);
+
+cusparseStatus_t CUSPARSEAPI
+cusparseSpMMOp_destroyPlan(cusparseSpMMOpPlan_t plan);
+
+//------------------------------------------------------------------------------
+
+#if defined(__cplusplus)
+} // extern "C"
+#endif // defined(__cplusplus)
+
+#undef CUSPARSE_DEPRECATED_REPLACE_WITH
+#undef CUSPARSE_DEPRECATED
+#undef CUSPARSE_DEPRECATED_TYPE
+#undef CUSPARSE_DEPRECATED_TYPE_MSVC
+#undef CUSPARSE_DEPRECATED_ENUM_REPLACE_WITH
+#undef CUSPARSE_DEPRECATED_ENUM
+
+#endif // !defined(CUSPARSE_H_)
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse_v2.h b/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse_v2.h
new file mode 100644
index 0000000000000000000000000000000000000000..f889e1f569d46d1116fe6e302429b3855de43c21
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/cusparse/include/cusparse_v2.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 1993-2019 NVIDIA Corporation.  All rights reserved.
+ *
+ * NOTICE TO LICENSEE:
+ *
+ * This source code and/or documentation ("Licensed Deliverables") are
+ * subject to NVIDIA intellectual property rights under U.S. and
+ * international Copyright laws.
+ *
+ * These Licensed Deliverables contained herein is PROPRIETARY and
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
+ * conditions of a form of NVIDIA software license agreement by and
+ * between NVIDIA and Licensee ("License Agreement") or electronically
+ * accepted by Licensee.  Notwithstanding any terms or conditions to
+ * the contrary in the License Agreement, reproduction or disclosure
+ * of the Licensed Deliverables to any third party without the express
+ * written consent of NVIDIA is prohibited.
+ *
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE.  IT IS
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THESE LICENSED DELIVERABLES.
+ *
+ * U.S. Government End Users.  These Licensed Deliverables are a
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
+ * 1995), consisting of "commercial computer software" and "commercial
+ * computer software documentation" as such terms are used in 48
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
+ * only as a commercial end item.  Consistent with 48 C.F.R.12.212 and
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
+ * U.S. Government End Users acquire the Licensed Deliverables with
+ * only those rights set forth herein.
+ *
+ * Any use of the Licensed Deliverables in individual and commercial
+ * software must include, in the user documentation and internal
+ * comments to the code, the above Disclaimer and U.S. Government End
+ * Users Notice.
+ */
+#if !defined(CUSPARSE_V2_H_)
+#define CUSPARSE_V2_H_
+
+#include "cusparse.h" // this header declares nothing itself; it only forwards to cusparse.h
+
+#endif // !defined(CUSPARSE_V2_H_)
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06ae7fc0a4907be9d244a49c9a970a012b7c7a52
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/cusparse/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nccl/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/nccl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nccl/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/nccl/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..82af1065336868b19d75cd74e0156bc0fe72ba46
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/nccl/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce696afda66408fd395a022478220730b4bf725c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/nccl/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h b/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3d04b723ddf0b6f574316ae49bc52297effff02
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl.h
@@ -0,0 +1,439 @@
+/*************************************************************************
+ * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_H_
+#define NCCL_H_
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#if CUDART_VERSION >= 11000
+#include <cuda_bf16.h>
+#endif
+
+#define NCCL_MAJOR 2
+#define NCCL_MINOR 21
+#define NCCL_PATCH 5
+#define NCCL_SUFFIX ""
+
+#define NCCL_VERSION_CODE 22105 /* equals NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH) */
+#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) /* X <= 2 && Y <= 8: major scaled by 1000; otherwise by 10000 */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <limits.h>
+/* Opaque handle to communicator */
+typedef struct ncclComm* ncclComm_t;
+#define NCCL_COMM_NULL NULL
+
+#define NCCL_UNIQUE_ID_BYTES 128
+typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
+
+/* Error type */
+typedef enum { ncclSuccess                 =  0,
+               ncclUnhandledCudaError      =  1,
+               ncclSystemError             =  2,
+               ncclInternalError           =  3,
+               ncclInvalidArgument         =  4,
+               ncclInvalidUsage            =  5,
+               ncclRemoteError             =  6,
+               ncclInProgress              =  7,
+               ncclNumResults              =  8 } ncclResult_t;
+
+#define NCCL_CONFIG_UNDEF_INT INT_MIN
+#define NCCL_CONFIG_UNDEF_PTR NULL
+#define NCCL_SPLIT_NOCOLOR -1
+
+/* Communicator configuration. Users can assign value to attributes to specify the
+ * behavior of a communicator. */
+typedef struct ncclConfig_v21700 {
+  /* attributes that users should never touch. */
+  size_t size;
+  unsigned int magic;
+  unsigned int version;
+  /* attributes that users are able to customize. */
+  int blocking;
+  int cgaClusterSize;
+  int minCTAs;
+  int maxCTAs;
+  const char *netName;
+  int splitShare;
+} ncclConfig_t;
+
+/* Config initializer must be assigned to initialize config structure when it is created.
+ * Not initialized config will result in NCCL error. */
+#define NCCL_CONFIG_INITIALIZER {                                       \
+  sizeof(ncclConfig_t), /* size */                                      \
+  0xcafebeef,           /* magic */                                     \
+  NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */       \
+  NCCL_CONFIG_UNDEF_INT,                    /* blocking */              \
+  NCCL_CONFIG_UNDEF_INT,                    /* cgaClusterSize */        \
+  NCCL_CONFIG_UNDEF_INT,                    /* minCTAs */               \
+  NCCL_CONFIG_UNDEF_INT,                    /* maxCTAs */               \
+  NCCL_CONFIG_UNDEF_PTR,                    /* netName */               \
+  NCCL_CONFIG_UNDEF_INT                     /* splitShare */            \
+}
+
+/* NCCL malloc and free function for all types of NCCL optimizations
+ * (e.g. user buffer registration). The actual allocated size might
+ * be larger than requested due to granularity requirement. */
+ncclResult_t  ncclMemAlloc(void** ptr, size_t size);
+ncclResult_t pncclMemAlloc(void** ptr, size_t size);
+
+ncclResult_t  ncclMemFree(void *ptr);
+ncclResult_t pncclMemFree(void *ptr);
+
+/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer.
+ * This integer is coded with the MAJOR, MINOR and PATCH level of the
+ * NCCL library
+ */
+ncclResult_t  ncclGetVersion(int *version);
+ncclResult_t pncclGetVersion(int *version);
+
+/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be
+ * called once and the Id should be distributed to all ranks in the
+ * communicator before calling ncclCommInitRank. */
+ncclResult_t  ncclGetUniqueId(ncclUniqueId* uniqueId);
+ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId);
+
+/* Create a new communicator (multi thread/process version) with a configuration
+ * set by users. */
+ncclResult_t  ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
+ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config);
+
+/* Creates a new communicator (multi thread/process version).
+ * rank must be between 0 and nranks-1 and unique within a communicator clique.
+ * Each rank is associated with a CUDA device, which has to be set before calling
+ * ncclCommInitRank.
+ * ncclCommInitRank implicitly synchronizes with other ranks, so it must be
+ * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */
+ncclResult_t  ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank);
+
+/* Creates a clique of communicators (single process version).
+ * This is a convenience function to create a single-process communicator clique.
+ * Returns an array of ndev newly initialized communicators in comm.
+ * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t).
+ * If devlist is NULL, the first ndev CUDA devices are used.
+ * Order of devlist defines user-order of processors within the communicator. */
+ncclResult_t  ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist);
+
+/* Finalize a communicator. ncclCommFinalize flushes all issued communications,
+ * and marks communicator state as ncclInProgress. The state will change to ncclSuccess
+ * when the communicator is globally quiescent and related resources are freed; then,
+ * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator
+ * itself) without blocking. */
+ncclResult_t  ncclCommFinalize(ncclComm_t comm);
+ncclResult_t pncclCommFinalize(ncclComm_t comm);
+
+/* Frees local resources associated with communicator object. */
+ncclResult_t  ncclCommDestroy(ncclComm_t comm);
+ncclResult_t pncclCommDestroy(ncclComm_t comm);
+
+/* Frees resources associated with communicator object and aborts any operations
+ * that might still be running on the device. */
+ncclResult_t  ncclCommAbort(ncclComm_t comm);
+ncclResult_t pncclCommAbort(ncclComm_t comm);
+
+/* Creates one or more communicators from an existing one.
+ * Ranks with the same color will end up in the same communicator.
+ * Within the new communicator, key will be used to order ranks.
+ * NCCL_SPLIT_NOCOLOR as color will indicate the rank will not be part of any group
+ * and will therefore return a NULL communicator.
+ * If config is NULL, the new communicator will inherit the original communicator's
+ * configuration. */
+ncclResult_t  ncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+ncclResult_t pncclCommSplit(ncclComm_t comm, int color, int key, ncclComm_t *newcomm, ncclConfig_t* config);
+
+/* Returns a string for each error code. */
+const char*  ncclGetErrorString(ncclResult_t result);
+const char* pncclGetErrorString(ncclResult_t result);
+
+/* Returns a human-readable message of the last error that occurred. */
+const char*  ncclGetLastError(ncclComm_t comm);
+const char* pncclGetLastError(ncclComm_t comm);
+
+/* Checks whether the comm has encountered any asynchronous errors */
+ncclResult_t  ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError);
+
+/* Gets the number of ranks in the communicator clique. */
+ncclResult_t  ncclCommCount(const ncclComm_t comm, int* count);
+ncclResult_t pncclCommCount(const ncclComm_t comm, int* count);
+
+/* Returns the cuda device number associated with the communicator. */
+ncclResult_t  ncclCommCuDevice(const ncclComm_t comm, int* device);
+ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device);
+
+/* Returns the user-ordered "rank" associated with the communicator. */
+ncclResult_t  ncclCommUserRank(const ncclComm_t comm, int* rank);
+ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank);
+
+/* Register CUDA buffer for zero-copy operation */
+ncclResult_t  ncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+ncclResult_t pncclCommRegister(const ncclComm_t comm, void* buff, size_t size, void** handle);
+
+/* Deregister CUDA buffer */
+ncclResult_t  ncclCommDeregister(const ncclComm_t comm, void* handle);
+ncclResult_t pncclCommDeregister(const ncclComm_t comm, void* handle);
+
+/* Reduction operation selector */
+typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t;
+typedef enum { ncclSum        = 0,
+               ncclProd       = 1,
+               ncclMax        = 2,
+               ncclMin        = 3,
+               ncclAvg        = 4,
+               /* ncclNumOps: The number of built-in ncclRedOp_t values. Also
+                * serves as the least possible value for dynamic ncclRedOp_t's
+                * as constructed by ncclRedOpCreate*** functions. */
+               ncclNumOps     = 5,
+               /* ncclMaxRedOp: The largest valid value for ncclRedOp_t.
+                * It is defined to be the largest signed value (since compilers
+                * are permitted to use signed enums) that won't grow
+                * sizeof(ncclRedOp_t) when compared to previous NCCL versions to
+                * maintain ABI compatibility. */
+               ncclMaxRedOp   = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t))
+             } ncclRedOp_t;
+
+/* Data types */
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+#if defined(__CUDA_BF16_TYPES_EXIST__)
+               ncclBfloat16   = 9,
+               ncclNumTypes   = 10
+#else
+               ncclNumTypes   = 9
+#endif
+} ncclDataType_t;
+
+/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */
+typedef enum {
+  /* ncclScalarDevice: The scalar is in device-visible memory and will be
+   * dereferenced while the collective is running. */
+  ncclScalarDevice = 0,
+
+  /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be
+   * dereferenced before the ncclRedOpCreate***() function returns. */
+  ncclScalarHostImmediate = 1
+} ncclScalarResidence_t;
+
+/*
+ * ncclRedOpCreatePreMulSum
+ *
+ * Creates a new reduction operator which pre-multiplies input values by a given
+ * scalar locally before reducing them with peer values via summation. For use
+ * only with collectives launched against *comm* and *datatype*. The
+ * *residence* argument indicates how/when the memory pointed to by *scalar*
+ * will be dereferenced. Upon return, the newly created operator's handle
+ * is stored in *op*.
+ */
+ncclResult_t  ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
+ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm);
+
+/*
+ * ncclRedOpDestroy
+ *
+ * Destroys the reduction operator *op*. The operator must have been created by
+ * ncclRedOpCreatePreMulSum with the matching communicator *comm*. An operator may be
+ * destroyed as soon as the last NCCL function which is given that operator returns.
+ */
+ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
+ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm);
+
+/*
+ * Collective communication operations
+ *
+ * Collective communication operations must be called separately for each
+ * communicator in a communicator clique.
+ *
+ * They return when operations have been enqueued on the CUDA stream.
+ *
+ * Since they may perform inter-CPU synchronization, each call has to be done
+ * from a different thread or process, or need to use Group Semantics (see
+ * below).
+ */
+
+/*
+ * Reduce
+ *
+ * Reduces data arrays of length count in sendbuff into recvbuff using op
+ * operation.
+ * recvbuff may be NULL on all calls except for root device.
+ * root is the rank (not the CUDA device) where data will reside after the
+ * operation is complete.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype,
+    ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * (deprecated) Broadcast (in-place)
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * This operation is implicitly in place.
+ */
+ncclResult_t  ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Broadcast
+ *
+ * Copies count values from root to all other devices.
+ * root is the rank (not the CUDA device) where data resides before the
+ * operation is started.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * All-Reduce
+ *
+ * Reduces data arrays of length count in sendbuff using op operation, and
+ * leaves identical copies of result on each recvbuff.
+ *
+ * In-place operation will happen if sendbuff == recvbuff.
+ */
+ncclResult_t  ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count,
+    ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Reduce-Scatter
+ *
+ * Reduces data in sendbuff using op operation and leaves reduced result
+ * scattered over the devices so that recvbuff on rank i will contain the i-th
+ * block of the result.
+ * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff
+ * should have a size of at least nranks*recvcount elements.
+ *
+ * In-place operations will happen if recvbuff == sendbuff + rank * recvcount.
+ */
+ncclResult_t  ncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff,
+    size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
+    cudaStream_t stream);
+
+/*
+ * All-Gather
+ *
+ * Each device gathers sendcount values from other GPUs into recvbuff,
+ * receiving data from rank i at offset i*sendcount.
+ * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff
+ * should have a size of at least nranks*sendcount elements.
+ *
+ * In-place operations will happen if sendbuff == recvbuff + rank * sendcount.
+ */
+ncclResult_t  ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount,
+    ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Send
+ *
+ * Send data from sendbuff to rank peer.
+ *
+ * Rank peer needs to call ncclRecv with the same datatype and the same count from this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ncclResult_t  ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Receive
+ *
+ * Receive data from rank peer into recvbuff.
+ *
+ * Rank peer needs to call ncclSend with the same datatype and the same count to this
+ * rank.
+ *
+ * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations
+ * need to progress concurrently to complete, they must be fused within a ncclGroupStart/
+ * ncclGroupEnd section.
+ */
+ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+ncclResult_t  ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer,
+    ncclComm_t comm, cudaStream_t stream);
+
+/*
+ * Group semantics
+ *
+ * When managing multiple GPUs from a single thread, and since NCCL collective
+ * calls may perform inter-CPU synchronization, we need to "group" calls for
+ * different ranks/devices into a single call.
+ *
+ * Grouping NCCL calls as being part of the same collective operation is done
+ * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all
+ * collective calls until the ncclGroupEnd call, which will wait for all calls
+ * to be complete. Note that for collective communication, ncclGroupEnd only
+ * guarantees that the operations are enqueued on the streams, not that
+ * the operation is effectively done.
+ *
+ * Both collective communication and ncclCommInitRank can be used in conjunction
+ * with ncclGroupStart/ncclGroupEnd, but not together.
+ *
+ * Group semantics also make it possible to fuse multiple operations on the
+ * same device to improve performance (for aggregated collective calls), or to
+ * permit concurrent progress of multiple send/receive operations.
+ */
+
+/*
+ * Group Start
+ *
+ * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into
+ * a single NCCL operation. Nothing will be started on the CUDA stream until
+ * ncclGroupEnd. Takes no arguments; declared (void) so C callers are checked.
+ */
+ncclResult_t  ncclGroupStart(void);
+ncclResult_t pncclGroupStart(void);
+
+/*
+ * Group End
+ *
+ * End a group call. Start a fused NCCL operation consisting of all calls since
+ * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations
+ * need to be called after ncclGroupEnd. Takes no arguments (declared (void)).
+ */
+ncclResult_t  ncclGroupEnd(void);
+ncclResult_t pncclGroupEnd(void);
+
+#ifdef __cplusplus
+} // end extern "C"
+#endif
+
+#endif // end include guard
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl_net.h b/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl_net.h
new file mode 100644
index 0000000000000000000000000000000000000000..467d9fdb89f546a0491e12b38f8fbe2d153f1c68
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/nccl/include/nccl_net.h
@@ -0,0 +1,456 @@
+/*************************************************************************
+ * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved.
+ *
+ * See LICENSE.txt for license information
+ ************************************************************************/
+
+#ifndef NCCL_NET_H_
+#define NCCL_NET_H_
+
+#include "nccl.h"
+#include "nccl_common.h"
+#include "net_device.h"
+#include <stdint.h>
+
+#define NCCL_NET_HANDLE_MAXSIZE 128
+
+#define NCCL_PTR_HOST 0x1
+#define NCCL_PTR_CUDA 0x2
+#define NCCL_PTR_DMABUF 0x4
+
+// Maximum number of requests per comm object
+#define NCCL_NET_MAX_REQUESTS 32
+
+typedef struct {
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int regIsGlobal;                 // regMr is not tied to a particular comm
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type
+  int netDeviceVersion;            // Version number for network offload
+} ncclNetProperties_v8_t;
+
+typedef ncclNetProperties_v8_t ncclNetProperties_t;
+
+typedef struct { // Table of function pointers a network implementation fills in (v8 layout).
+  // Name of the network (mainly for logs).
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v8_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v8_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, size_t size, int type, void** mhandle);
+  // DMA-BUF support: like regMr, but additionally takes an offset and a file descriptor.
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU.
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects.
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code.
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device.
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v8_t;
+
+typedef ncclNet_v8_t ncclNet_t;
+
+#define NCCL_NET_PLUGIN_SYMBOL ncclNetPlugin_v8
+
+typedef struct {
+  void* mhandle;
+  void* address;
+  uint32_t size;
+} ncclNetSGE_v8_t;
+
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v8_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, size_t size, int type, void** mhandle);
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  ncclResult_t (*iallgather)(void* collComm, void* sendData, int nRecvParts, ncclNetSGE_v8_t* recvParts,
+                             size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                             void* sendMhandle, void** request);
+  ncclResult_t (*ireducescatter)(void* collComm, int nSendParts, ncclNetSGE_v8_t* sendParts, void* recvData,
+                                 size_t bytesPerRank, size_t windowOffset, size_t windowBytes,
+                                 ncclDataType_t dataType, ncclRedOp_t redOp,
+                                 void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v8_t;
+
+typedef ncclCollNet_v8_t ncclCollNet_t; // The unversioned name is an alias for the v8 collective-network interface.
+
+#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v8
+
+typedef struct { // v7 device properties; the getProperties() entries of ncclNet_v7_t and ncclCollNet_v7_t take a pointer to this.
+  char* name;                      // Used mostly for logging.
+  char* pciPath;                   // Path to the PCI device in /sys.
+  uint64_t guid;                   // Unique identifier for the NIC chip. Important for
+                                   // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport;                  // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;                       // Port speed in Mbps.
+  int port;                        // Port number.
+  float latency;                   // Network latency. NOTE(review): the unit is not stated here -- confirm.
+  int maxComms;                    // Maximum number of comms we can create
+  int maxRecvs;                    // Maximum number of grouped receives.
+  ncclNetDeviceType netDeviceType; // Network offload type (field not present in ncclNetProperties_v6_t)
+  int netDeviceVersion;            // Version number for network offload (field not present in ncclNetProperties_v6_t)
+} ncclNetProperties_v7_t;
+
+typedef struct { // v7 point-to-point network interface: a name plus a table of function pointers.
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  // If *sendDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm, ncclNetDeviceHandle_v7_t** sendDevComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  // If *recvDevComm points to a valid object, then NCCL is requesting device offload for this connection
+  ncclResult_t (*accept)(void* listenComm, void** recvComm, ncclNetDeviceHandle_v7_t** recvDevComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); // size is an int here; regMrDmaBuf takes a size_t.
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+
+  // Copy the given mhandle to a dptr in a format usable by this plugin's device code
+  ncclResult_t (*getDeviceMr)(void* comm, void* mhandle, void** dptr_mhandle);
+
+  // Notify the plugin that a recv has been completed by the device
+  ncclResult_t (*irecvConsumed)(void* recvComm, int n, void* request);
+} ncclNet_v7_t; // Differs from ncclNet_v6_t by: v7 properties, the extra device-handle argument on connect/accept, getDeviceMr and irecvConsumed.
+
+typedef struct { // v7 collective-network interface: a name plus a table of function pointers.
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v7_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); // size is an int here; regMrDmaBuf takes a size_t.
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v7_t; // Same entries as ncclCollNet_v6_t, except getProperties() takes ncclNetProperties_v7_t.
+
+#define NCCL_NET_MAX_REQUESTS_V6 8
+
+// v6 device properties, kept for backwards compatibility. Both the v6 and the v5 interfaces below take this struct in getProperties().
+typedef struct {
+  char* name;     // Used mostly for logging.
+  char* pciPath;  // Path to the PCI device in /sys.
+  uint64_t guid;  // Unique identifier for the NIC chip. Important for
+                  // cards with multiple PCI functions (Physical or virtual).
+  int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF]
+  int speed;      // Port speed in Mbps.
+  int port;       // Port number.
+  float latency;  // Network latency. NOTE(review): the unit is not stated here -- confirm.
+  int maxComms;   // Maximum number of comms we can create
+  int maxRecvs;   // Maximum number of grouped receives.
+} ncclNetProperties_v6_t;
+
+typedef struct { // v6 point-to-point network interface: a name plus a table of function pointers.
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); // size is an int here; regMrDmaBuf takes a size_t.
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v6_t; // Same entries as ncclNet_v5_t, plus regMrDmaBuf.
+
+typedef struct { // v6 collective-network interface: a name plus a table of function pointers.
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); // size is an int here; regMrDmaBuf takes a size_t.
+  /* DMA-BUF support */
+  ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v6_t; // Same entries as ncclCollNet_v5_t, plus regMrDmaBuf.
+
+// v5 point-to-point network interface, kept for backwards compatibility. It has no regMrDmaBuf entry and reports properties as ncclNetProperties_v6_t.
+typedef struct {
+  // Name of the network (mainly for logs)
+  const char* name;
+  // Initialize the network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create a connection.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Connect to a handle and return a sending comm object for that peer.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with sendComm == NULL with the expectation that
+  // it will be called again until sendComm != NULL.
+  ncclResult_t (*connect)(int dev, void* handle, void** sendComm);
+  // Finalize connection establishment after remote peer has called connect.
+  // This call must not block for the connection to be established, and instead
+  // should return successfully with recvComm == NULL with the expectation that
+  // it will be called again until recvComm != NULL.
+  ncclResult_t (*accept)(void* listenComm, void** recvComm);
+  // Register/Deregister memory. Comm can be either a sendComm or a recvComm.
+  // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* comm, void* mhandle);
+  // Asynchronous send to a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request);
+  // Asynchronous recv from a peer.
+  // May return request == NULL if the call cannot be performed (or would block)
+  ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request);
+  // Test whether a request is complete. If sizes is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* sizes);
+  // Close and free send/recv comm objects
+  ncclResult_t (*closeSend)(void* sendComm);
+  ncclResult_t (*closeRecv)(void* recvComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclNet_v5_t;
+
+// v5 collective-network interface, kept for backwards compatibility. It has no regMrDmaBuf entry and reports properties as ncclNetProperties_v6_t.
+typedef struct {
+  // Name of the collective network (mainly for logs)
+  const char* name;
+  // Initialize the collective network.
+  ncclResult_t (*init)(ncclDebugLogger_t logFunction);
+  // Return the number of adapters capable of doing collective operations.
+  // If ndev returns 0, all other functions might be set to NULL.
+  ncclResult_t (*devices)(int* ndev);
+  // Get various device properties.
+  ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props);
+  // Create a receiving object and provide a handle to connect to it. The
+  // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged
+  // between ranks to create connections.
+  ncclResult_t (*listen)(int dev, void* handle, void** listenComm);
+  // Create a group for collective operations. handles have been created
+  // using listen() above. rank indicates caller's rank in the collective network.
+  ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm);
+  // Returns whether a reduction operation on a data type is supported.
+  // 1 for supported, 0 otherwise.
+  ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported);
+  // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA.
+  ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle);
+  ncclResult_t (*deregMr)(void* collComm, void* mhandle);
+  // Performs an asynchronous allreduce operation on the collective group.
+  // May return request == NULL if the call cannot be performed (or would block).
+  ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count,
+      ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request);
+  // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is
+  // visible to the GPU
+  ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request);
+  // Test whether a request is complete. If size is not NULL, it returns the
+  // number of bytes sent/received.
+  ncclResult_t (*test)(void* request, int* done, int* size);
+  // Close and free collective comm objects
+  ncclResult_t (*closeColl)(void* collComm);
+  ncclResult_t (*closeListen)(void* listenComm);
+} ncclCollNet_v5_t;
+
+#endif // end include guard
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c64e5dea9e1587c8adadaaf13f71c25525fcbd9
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/nccl/lib/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..455eaacf9185f611f994b05ae7e7e79a7f7c33a3
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/include/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/include/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/include/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/include/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83766db0e8f15b4662ea40be382c14e27ac0b9f0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/include/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/include/nvJitLink.h b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/include/nvJitLink.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc4be77a6eea692dbd06d814d0c21c80d7eaeff2
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/include/nvJitLink.h
@@ -0,0 +1,522 @@
+/*
+ * NVIDIA_COPYRIGHT_BEGIN
+ *
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * NVIDIA CORPORATION and its licensors retain all intellectual property
+ * and proprietary rights in and to this software, related documentation
+ * and any modifications thereto.  Any use, reproduction, disclosure or
+ * distribution of this software and related documentation without an express
+ * license agreement from NVIDIA CORPORATION is strictly prohibited.
+ *
+ * NVIDIA_COPYRIGHT_END
+ */
+
+#ifndef nvJitLink_INCLUDED
+#define nvJitLink_INCLUDED
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#ifdef __cplusplus
+extern "C" { /* opened after the standard includes so they stay outside the linkage block */
+#endif
+
+/**
+ *
+ * \defgroup error Error codes
+ *
+ */
+
+/** \ingroup error
+ *
+ * \brief    The enumerated type nvJitLinkResult defines API call result codes.
+ *           nvJitLink APIs return nvJitLinkResult codes to indicate the result.
+ */
+
+typedef enum {
+  NVJITLINK_SUCCESS = 0,                // only explicitly numbered value; the rest count up from it
+  NVJITLINK_ERROR_UNRECOGNIZED_OPTION,  // = 1
+  NVJITLINK_ERROR_MISSING_ARCH,         // = 2; -arch=sm_NN option not specified
+  NVJITLINK_ERROR_INVALID_INPUT,        // = 3
+  NVJITLINK_ERROR_PTX_COMPILE,          // = 4
+  NVJITLINK_ERROR_NVVM_COMPILE,         // = 5
+  NVJITLINK_ERROR_INTERNAL,             // = 6
+  NVJITLINK_ERROR_THREADPOOL,           // = 7
+  NVJITLINK_ERROR_UNRECOGNIZED_INPUT,   // = 8
+#ifdef NEW_ERROR_CODES // These error codes will appear in a future CUDA release.
+  NVJITLINK_ERROR_NULL_INPUT,           // = 9
+  NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS, // = 10
+  NVJITLINK_ERROR_INCORRECT_INPUT_TYPE, // = 11
+  NVJITLINK_ERROR_ARCH_MISMATCH,        // = 12
+  NVJITLINK_ERROR_OUTDATED_LIBRARY,     // = 13
+  NVJITLINK_ERROR_MISSING_FATBIN        // = 14
+#endif
+} nvJitLinkResult;
+
+#ifndef NEW_ERROR_CODES // To avoid breaking compatibility, the new names are macros for existing error codes for now, so each compares equal to the code it aliases.
+#define NVJITLINK_ERROR_NULL_INPUT NVJITLINK_ERROR_INVALID_INPUT
+#define NVJITLINK_ERROR_INCOMPATIBLE_OPTIONS NVJITLINK_ERROR_INVALID_INPUT
+#define NVJITLINK_ERROR_INCORRECT_INPUT_TYPE NVJITLINK_ERROR_INVALID_INPUT
+#define NVJITLINK_ERROR_ARCH_MISMATCH NVJITLINK_ERROR_INTERNAL
+#define NVJITLINK_ERROR_OUTDATED_LIBRARY NVJITLINK_ERROR_INTERNAL
+#define NVJITLINK_ERROR_MISSING_FATBIN NVJITLINK_ERROR_INVALID_INPUT
+#endif // !NEW_ERROR_CODES
+
+/**
+ *
+ * \defgroup linking Linking
+ *
+ */
+
+/** \ingroup linking
+ *
+ * \brief    The enumerated type nvJitLinkInputType defines the kind of inputs
+ *           that can be passed to nvJitLinkAdd* APIs.
+ */
+
+typedef enum {
+  NVJITLINK_INPUT_NONE = 0, // error
+  NVJITLINK_INPUT_CUBIN = 1,
+  NVJITLINK_INPUT_PTX,      // = 2
+  NVJITLINK_INPUT_LTOIR,    // = 3
+  NVJITLINK_INPUT_FATBIN,   // = 4
+  NVJITLINK_INPUT_OBJECT,   // = 5
+  NVJITLINK_INPUT_LIBRARY,  // = 6
+  NVJITLINK_INPUT_ANY = 10 // will dynamically determine one of above types
+} nvJitLinkInputType;
+
+/**
+ * \defgroup options Supported Link Options
+ *
+ * nvJitLink supports the link options below.
+ * Option names are prefixed with a single dash (\c -).
+ * Options that take a value have an assignment operator (\c =)
+ * followed by the option value, with no spaces, e.g. \c "-arch=sm_90".
+ *
+ * The supported options are:
+ * - \c -arch=sm_<N\> \n
+ *   Pass SM architecture value.  See nvcc for valid values of <N\>.
+ *   Can use compute_<N\> value instead if only generating PTX.
+ *   This is a required option.  
+ * - \c -maxrregcount=<N\> \n
+ *   Maximum register count.
+ * - \c -time \n
+ *   Print timing information to InfoLog.
+ * - \c -verbose \n
+ *   Print verbose messages to InfoLog.
+ * - \c -lto \n
+ *   Do link time optimization.
+ * - \c -ptx \n
+ *   Emit ptx after linking instead of cubin; only supported with \c -lto
+ * - \c -O<N\> \n
+ *   Optimization level. Only 0 and 3 are accepted.
+ * - \c -g \n
+ *   Generate debug information.
+ * - \c -lineinfo \n
+ *   Generate line information.
+ * - \c -ftz=<n\> \n
+ *   Flush to zero.
+ * - \c -prec-div=<n\> \n
+ *   Precise divide.
+ * - \c -prec-sqrt=<n\> \n
+ *   Precise square root.
+ * - \c -fma=<n\> \n
+ *   Fused multiply-add.
+ * - \c -kernels-used=<name\> \n
+ *   Pass list of kernels that are used; any not in the list can be removed.
+ *   This option can be specified multiple times.
+ * - \c -variables-used=<name\> \n
+ *   Pass list of variables that are used; any not in the list can be removed.
+ *   This option can be specified multiple times.
+ * - \c -optimize-unused-variables \n
+ *   Normally device code optimization is limited by not knowing what the
+ *   host code references.  With this option it can assume that if a variable
+ *   is not referenced in device code then it can be removed.
+ * - \c -Xptxas=<opt\> \n
+ *   Pass <opt\> to ptxas.  This option can be specified multiple times.
+ * - \c -split-compile=<N\> \n
+ *   Split compilation maximum thread count. Use 0 to use all available processors.
+ *   Value of 1 disables split compilation (default).
+ * - \c -split-compile-extended=<N\> \n
+ *   [Experimental] A more aggressive form of split compilation.
+ *   Accepts a maximum thread count value. Use 0 to use all available processors.
+ *   Value of 1 disables extended split compilation (default).
+ * - \c -jump-table-density=<N\> \n
+ *   When doing LTO, specify the case density percentage in switch statements,
+ *   and use it as a minimal threshold to determine whether a jump table (brx.idx
+ *   instruction) will be used to implement a switch statement. The default
+ *   value is 101. The percentage ranges from 0 to 101 inclusive.
+ */
+
+/**
+ * \ingroup linking
+ * \brief   nvJitLinkHandle is the unit of linking, and an opaque handle for
+ *          a program.
+ *
+ * To link inputs, an instance of nvJitLinkHandle must be created first with
+ * nvJitLinkCreate().
+ */
+
+typedef struct nvJitLink* nvJitLinkHandle; // opaque handle: a pointer to struct nvJitLink, which is only declared (not defined) at this point
+
+// For versioning, each library version has its own set of API symbols (note the _12_4 suffix on the declarations below).
+
+/* Versioned entry point. Unless NVJITLINK_NO_INLINE is defined, the inline
+ * wrapper nvJitLinkCreate() below forwards to it with the same arguments. */
+extern nvJitLinkResult
+__nvJitLinkCreate_12_4(nvJitLinkHandle *handle, uint32_t numOptions, const char **options);
+/**
+ * \ingroup linking
+ * \brief   Creates an instance of nvJitLinkHandle using the supplied input
+ *          options and stores it through the output parameter \p handle.
+ *
+ * \param   [out] handle       Address of the nvJitLink handle to set.
+ * \param   [in]  numOptions   Number of options passed in \p options.
+ * \param   [in]  options      Array of \p numOptions option strings.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_UNRECOGNIZED_OPTION\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_MISSING_ARCH\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * The accepted options are the ones listed in \ref options.
+ *
+ * \see nvJitLinkDestroy
+ */
+#ifndef NVJITLINK_NO_INLINE
+static inline nvJitLinkResult
+nvJitLinkCreate(nvJitLinkHandle *handle, uint32_t numOptions, const char **options)
+{
+  /* Thin forwarder: every argument goes to the versioned symbol unchanged. */
+  const nvJitLinkResult status = __nvJitLinkCreate_12_4(handle, numOptions, options);
+  return status;
+}
+#endif /* !NVJITLINK_NO_INLINE */
+ 
+extern nvJitLinkResult __nvJitLinkDestroy_12_4(nvJitLinkHandle *handle); /* versioned entry point */
+/**
+ * \ingroup linking
+ * \brief   Frees the memory associated with the given handle and sets the handle to NULL.
+ *
+ * \param    [in,out] handle  Address of nvJitLink handle.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * \see nvJitLinkCreate
+ */
+#ifndef NVJITLINK_NO_INLINE
+static inline nvJitLinkResult
+nvJitLinkDestroy(nvJitLinkHandle *handle)
+{
+  return __nvJitLinkDestroy_12_4(handle); /* thin forwarder to the versioned symbol */
+}
+#endif /* !NVJITLINK_NO_INLINE */
+ 
+/* Versioned entry point behind nvJitLinkAddData(); name can be null. */
+extern nvJitLinkResult __nvJitLinkAddData_12_4(
+  nvJitLinkHandle handle, nvJitLinkInputType inputType,
+  const void *data, size_t size,
+  const char *name);
+/**
+ * \ingroup linking
+ * \brief   Adds a data image held in memory to the link.
+ *
+ * \param    [in] handle      nvJitLink handle.
+ * \param    [in] inputType   kind of input.
+ * \param    [in] data        pointer to the data image in memory.
+ * \param    [in] size        size of the data.
+ * \param    [in] name        name of the input object.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * \p name can be null.
+ */
+#ifndef NVJITLINK_NO_INLINE
+static inline nvJitLinkResult
+nvJitLinkAddData(nvJitLinkHandle handle, nvJitLinkInputType inputType,
+                 const void *data, size_t size, const char *name)
+{
+  /* Thin forwarder: every argument goes to the versioned symbol unchanged. */
+  const nvJitLinkResult status = __nvJitLinkAddData_12_4(handle, inputType, data, size, name);
+  return status;
+}
+#endif /* !NVJITLINK_NO_INLINE */
+ 
+/* Versioned entry point behind nvJitLinkAddFile(); fileName includes the
+ * path to the file. */
+extern nvJitLinkResult
+__nvJitLinkAddFile_12_4(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char *fileName);
+/**
+ * \ingroup linking
+ * \brief   Reads data from the given file and links it in.
+ *
+ * \param    [in] handle      nvJitLink handle.
+ * \param    [in] inputType   kind of input.
+ * \param    [in] fileName    name of the file, including the path to it.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ */
+#ifndef NVJITLINK_NO_INLINE
+static inline nvJitLinkResult
+nvJitLinkAddFile(nvJitLinkHandle handle, nvJitLinkInputType inputType, const char *fileName)
+{
+  /* Thin forwarder: every argument goes to the versioned symbol unchanged. */
+  const nvJitLinkResult status = __nvJitLinkAddFile_12_4(handle, inputType, fileName);
+  return status;
+}
+#endif /* !NVJITLINK_NO_INLINE */
+ 
+extern nvJitLinkResult __nvJitLinkComplete_12_4(nvJitLinkHandle handle); /* versioned entry point */
+/**
+ * \ingroup linking
+ * \brief   Performs the actual link.
+ * \param    [in] handle      nvJitLink handle.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ */
+#ifndef NVJITLINK_NO_INLINE
+static inline nvJitLinkResult
+nvJitLinkComplete(nvJitLinkHandle handle)
+{
+  return __nvJitLinkComplete_12_4(handle); /* thin forwarder to the versioned symbol */
+}
+#endif /* !NVJITLINK_NO_INLINE */
+ 
+/* Versioned entry point behind nvJitLinkGetLinkedCubinSize(). */
+extern nvJitLinkResult
+__nvJitLinkGetLinkedCubinSize_12_4(nvJitLinkHandle handle, size_t *size);
+/**
+ * \ingroup linking
+ * \brief   Retrieves the size of the linked cubin.
+ *
+ * \param    [in] handle      nvJitLink handle.
+ * \param    [out] size       Receives the size of the linked cubin.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * \see nvJitLinkGetLinkedCubin
+ */
+#ifndef NVJITLINK_NO_INLINE
+static inline nvJitLinkResult
+nvJitLinkGetLinkedCubinSize(nvJitLinkHandle handle, size_t *size)
+{
+  const nvJitLinkResult status = __nvJitLinkGetLinkedCubinSize_12_4(handle, size);
+  return status; /* result of the versioned symbol, passed through unchanged */
+}
+#endif /* !NVJITLINK_NO_INLINE */
+
+extern nvJitLinkResult __nvJitLinkGetLinkedCubin_12_4( // called by the inline wrapper below
+  nvJitLinkHandle handle,
+  void *cubin);
+/**
+ * \ingroup linking
+ * \brief   nvJitLinkGetLinkedCubin gets the linked cubin.
+ *
+ * \param    [in] handle      nvJitLink handle.
+ * \param    [out] cubin      Caller-allocated buffer that receives the linked cubin.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * The caller is responsible for allocating enough space to hold the \p cubin.
+ * \see nvJitLinkGetLinkedCubinSize
+ */
+#ifndef NVJITLINK_NO_INLINE // define NVJITLINK_NO_INLINE to omit this inline wrapper
+static inline nvJitLinkResult nvJitLinkGetLinkedCubin(
+  nvJitLinkHandle handle,
+  void *cubin)
+{
+  return __nvJitLinkGetLinkedCubin_12_4 (handle, cubin); // forward to the _12_4-suffixed symbol
+}
+#endif // !NVJITLINK_NO_INLINE
+ 
+extern nvJitLinkResult __nvJitLinkGetLinkedPtxSize_12_4( // called by the inline wrapper below
+  nvJitLinkHandle handle,
+  size_t *size);
+/**
+ * \ingroup linking
+ * \brief   nvJitLinkGetLinkedPtxSize gets the size of the linked PTX.
+ *
+ * \param    [in] handle      nvJitLink handle.
+ * \param    [out] size       Receives the size of the linked PTX.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * Linked PTX is only available when using the \c -lto option.
+ * \see nvJitLinkGetLinkedPtx
+ */
+#ifndef NVJITLINK_NO_INLINE // define NVJITLINK_NO_INLINE to omit this inline wrapper
+static inline nvJitLinkResult nvJitLinkGetLinkedPtxSize(
+  nvJitLinkHandle handle,
+  size_t *size)
+{
+  return __nvJitLinkGetLinkedPtxSize_12_4 (handle, size); // forward to the _12_4-suffixed symbol
+}
+#endif // !NVJITLINK_NO_INLINE
+
+extern nvJitLinkResult __nvJitLinkGetLinkedPtx_12_4( // called by the inline wrapper below
+  nvJitLinkHandle handle,
+  char *ptx);
+/**
+ * \ingroup linking
+ * \brief   nvJitLinkGetLinkedPtx gets the linked PTX.
+ *
+ * \param    [in] handle      nvJitLink handle.
+ * \param    [out] ptx        Caller-allocated buffer that receives the linked PTX.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * Linked PTX is only available when using the \c -lto option.
+ * The caller is responsible for allocating enough space to hold the \p ptx.
+ * \see nvJitLinkGetLinkedPtxSize
+ */
+#ifndef NVJITLINK_NO_INLINE // define NVJITLINK_NO_INLINE to omit this inline wrapper
+static inline nvJitLinkResult nvJitLinkGetLinkedPtx(
+  nvJitLinkHandle handle,
+  char *ptx)
+{
+  return __nvJitLinkGetLinkedPtx_12_4 (handle, ptx); // forward to the _12_4-suffixed symbol
+}
+#endif // !NVJITLINK_NO_INLINE
+ 
+extern nvJitLinkResult __nvJitLinkGetErrorLogSize_12_4( // called by the inline wrapper below
+  nvJitLinkHandle handle,
+  size_t *size);
+/**
+ * \ingroup linking
+ * \brief   nvJitLinkGetErrorLogSize gets the size of the error log.
+ *
+ * \param    [in] handle      nvJitLink handle.
+ * \param    [out] size       Receives the size of the error log.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * \see nvJitLinkGetErrorLog
+ */
+#ifndef NVJITLINK_NO_INLINE // define NVJITLINK_NO_INLINE to omit this inline wrapper
+static inline nvJitLinkResult nvJitLinkGetErrorLogSize(
+  nvJitLinkHandle handle,
+  size_t *size)
+{
+  return __nvJitLinkGetErrorLogSize_12_4 (handle, size); // forward to the _12_4-suffixed symbol
+}
+#endif // !NVJITLINK_NO_INLINE
+
+extern nvJitLinkResult __nvJitLinkGetErrorLog_12_4( // called by the inline wrapper below
+  nvJitLinkHandle handle,
+  char *log);
+/**
+ * \ingroup linking
+ * \brief   nvJitLinkGetErrorLog puts any error messages in the log.
+ *
+ * \param    [in] handle      nvJitLink handle.
+ * \param    [out] log        Caller-allocated buffer that receives the error log.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * The caller is responsible for allocating enough space to hold the \p log.
+ * \see nvJitLinkGetErrorLogSize
+ */
+#ifndef NVJITLINK_NO_INLINE // define NVJITLINK_NO_INLINE to omit this inline wrapper
+static inline nvJitLinkResult nvJitLinkGetErrorLog(
+  nvJitLinkHandle handle,
+  char *log)
+{
+  return __nvJitLinkGetErrorLog_12_4 (handle, log); // forward to the _12_4-suffixed symbol
+}
+#endif // !NVJITLINK_NO_INLINE
+
+extern nvJitLinkResult __nvJitLinkGetInfoLogSize_12_4( // called by the inline wrapper below
+  nvJitLinkHandle handle,
+  size_t *size);
+/**
+ * \ingroup linking
+ * \brief   nvJitLinkGetInfoLogSize gets the size of the info log.
+ *
+ * \param    [in] handle      nvJitLink handle.
+ * \param    [out] size       Receives the size of the info log.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * \see nvJitLinkGetInfoLog
+ */
+#ifndef NVJITLINK_NO_INLINE // define NVJITLINK_NO_INLINE to omit this inline wrapper
+static inline nvJitLinkResult nvJitLinkGetInfoLogSize(
+  nvJitLinkHandle handle,
+  size_t *size)
+{
+  return __nvJitLinkGetInfoLogSize_12_4 (handle, size); // forward to the _12_4-suffixed symbol
+}
+#endif // !NVJITLINK_NO_INLINE
+
+extern nvJitLinkResult __nvJitLinkGetInfoLog_12_4( // called by the inline wrapper below
+  nvJitLinkHandle handle,
+  char *log);
+/**
+ * \ingroup linking
+ * \brief   nvJitLinkGetInfoLog puts any info messages in the log.
+ *
+ * \param    [in] handle      nvJitLink handle.
+ * \param    [out] log        Caller-allocated buffer that receives the info log.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ * The caller is responsible for allocating enough space to hold the \p log.
+ * \see nvJitLinkGetInfoLogSize
+ */
+#ifndef NVJITLINK_NO_INLINE // define NVJITLINK_NO_INLINE to omit this inline wrapper
+static inline nvJitLinkResult nvJitLinkGetInfoLog(
+  nvJitLinkHandle handle,
+  char *log)
+{
+  return __nvJitLinkGetInfoLog_12_4 (handle, log); // forward to the _12_4-suffixed symbol
+}
+#endif // !NVJITLINK_NO_INLINE
+
+/**
+ * \ingroup linking
+ * \brief   nvJitLinkVersion returns the current version of nvJitLink.
+ *
+ * \param    [out] major        Receives the major version.
+ * \param    [out] minor        Receives the minor version.
+ * \return
+ *   - \link #nvJitLinkResult NVJITLINK_SUCCESS \endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INVALID_INPUT\endlink
+ *   - \link #nvJitLinkResult NVJITLINK_ERROR_INTERNAL\endlink
+ *
+ */
+extern nvJitLinkResult nvJitLinkVersion( // declared directly: no _12_4-suffixed symbol or inline wrapper here
+  unsigned int *major,
+  unsigned int *minor);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // nvJitLink_INCLUDED
+
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/lib/__init__.py b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/lib/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/lib/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/lib/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d7ef94a1585d9030e75eec7833638a9f389728bf
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/nvidia/nvjitlink/lib/__pycache__/__init__.cpython-311.pyc differ