koichi12 commited on
Commit
80a73eb
·
verified ·
1 Parent(s): b964460

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. .venv/lib/python3.11/site-packages/nvidia/__pycache__/__init__.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/nvidia/cublas/__init__.py +0 -0
  4. .venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/nvidia/cublas/include/__init__.py +0 -0
  6. .venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h +891 -0
  8. .venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h +1845 -0
  9. .venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h +693 -0
  10. .venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h +0 -0
  11. .venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h +478 -0
  12. .venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h +824 -0
  13. .venv/lib/python3.11/site-packages/nvidia/cublas/lib/__init__.py +0 -0
  14. .venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  15. .venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.12 +3 -0
  16. .venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py +0 -0
  17. .venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py +0 -0
  19. .venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc +0 -0
  20. .venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h +869 -0
  21. .venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__init__.py +0 -0
  22. .venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/async.h +452 -0
  25. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_scan.h +174 -0
  26. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/driver_abi.h +99 -0
  27. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h +344 -0
  28. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/invoke.h +189 -0
  29. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/memory.h +135 -0
  30. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h +159 -0
  31. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h +419 -0
  32. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h +320 -0
  33. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/sync.h +282 -0
  34. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__init__.py +0 -0
  35. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc +0 -0
  36. .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12 +3 -0
  37. .venv/lib/python3.11/site-packages/nvidia/cudnn/__init__.py +0 -0
  38. .venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h +68 -0
  39. .venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h +671 -0
  40. .venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h +60 -0
  41. .venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_graph.h +909 -0
  42. .venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops.h +1316 -0
  43. .venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version_v9.h +70 -0
  44. .venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py +0 -0
  45. .venv/lib/python3.11/site-packages/nvidia/cusolver/__pycache__/__init__.cpython-311.pyc +0 -0
  46. .venv/lib/python3.11/site-packages/nvidia/cusolver/include/__init__.py +0 -0
  47. .venv/lib/python3.11/site-packages/nvidia/cusolver/include/__pycache__/__init__.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverDn.h +0 -0
  49. .venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverMg.h +318 -0
  50. .venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverRf.h +339 -0
.gitattributes CHANGED
@@ -120,3 +120,5 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
120
  .venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
121
  .venv/lib/python3.11/site-packages/pyasn1/type/__pycache__/univ.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
122
  .venv/lib/python3.11/site-packages/opencv_python_headless.libs/libvpx-9f572e11.so.9.1.0 filter=lfs diff=lfs merge=lfs -text
 
 
 
120
  .venv/lib/python3.11/site-packages/click/__pycache__/core.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
121
  .venv/lib/python3.11/site-packages/pyasn1/type/__pycache__/univ.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
122
  .venv/lib/python3.11/site-packages/opencv_python_headless.libs/libvpx-9f572e11.so.9.1.0 filter=lfs diff=lfs merge=lfs -text
123
+ .venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12 filter=lfs diff=lfs merge=lfs -text
124
+ .venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.12 filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/nvidia/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (179 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cublas/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cublas/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (186 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cublas/include/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cublas/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (194 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas.h ADDED
@@ -0,0 +1,891 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * This is the public header file for the CUBLAS library, defining the API
52
+ *
53
+ * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines)
54
+ * on top of the CUDA runtime.
55
+ */
56
+
57
+ #if !defined(CUBLAS_H_)
58
+ #define CUBLAS_H_
59
+
60
+ #if defined(CUBLAS_V2_H_)
61
+ #error "It is an error to include both cublas.h and cublas_v2.h"
62
+ #endif
63
+
64
+ #include <cuda_runtime.h>
65
+
66
+ #ifndef CUBLASWINAPI
67
+ #ifdef _WIN32
68
+ #define CUBLASWINAPI __stdcall
69
+ #else
70
+ #define CUBLASWINAPI
71
+ #endif
72
+ #endif
73
+
74
+ #undef CUBLASAPI
75
+ #ifdef __CUDACC__
76
+ #define CUBLASAPI __host__
77
+ #else
78
+ #define CUBLASAPI
79
+ #endif
80
+
81
+ #include "cublas_api.h"
82
+
83
+ #if defined(__cplusplus)
84
+ extern "C" {
85
+ #endif
86
+
87
+ /* CUBLAS data types */
88
+ #define cublasStatus cublasStatus_t
89
+
90
+ cublasStatus CUBLASWINAPI cublasInit(void);
91
+ cublasStatus CUBLASWINAPI cublasShutdown(void);
92
+ cublasStatus CUBLASWINAPI cublasGetError(void);
93
+
94
+ cublasStatus CUBLASWINAPI cublasGetVersion(int* version);
95
+ cublasStatus CUBLASWINAPI cublasAlloc(int n, int elemSize, void** devicePtr);
96
+
97
+ cublasStatus CUBLASWINAPI cublasFree(void* devicePtr);
98
+
99
+ cublasStatus CUBLASWINAPI cublasSetKernelStream(cudaStream_t stream);
100
+
101
+ /* ---------------- CUBLAS BLAS1 functions ---------------- */
102
+ /* NRM2 */
103
+ float CUBLASWINAPI cublasSnrm2(int n, const float* x, int incx);
104
+ double CUBLASWINAPI cublasDnrm2(int n, const double* x, int incx);
105
+ float CUBLASWINAPI cublasScnrm2(int n, const cuComplex* x, int incx);
106
+ double CUBLASWINAPI cublasDznrm2(int n, const cuDoubleComplex* x, int incx);
107
+ /*------------------------------------------------------------------------*/
108
+ /* DOT */
109
+ float CUBLASWINAPI cublasSdot(int n, const float* x, int incx, const float* y, int incy);
110
+ double CUBLASWINAPI cublasDdot(int n, const double* x, int incx, const double* y, int incy);
111
+ cuComplex CUBLASWINAPI cublasCdotu(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
112
+ cuComplex CUBLASWINAPI cublasCdotc(int n, const cuComplex* x, int incx, const cuComplex* y, int incy);
113
+ cuDoubleComplex CUBLASWINAPI cublasZdotu(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
114
+ cuDoubleComplex CUBLASWINAPI cublasZdotc(int n, const cuDoubleComplex* x, int incx, const cuDoubleComplex* y, int incy);
115
+ /*------------------------------------------------------------------------*/
116
+ /* SCAL */
117
+ void CUBLASWINAPI cublasSscal(int n, float alpha, float* x, int incx);
118
+ void CUBLASWINAPI cublasDscal(int n, double alpha, double* x, int incx);
119
+ void CUBLASWINAPI cublasCscal(int n, cuComplex alpha, cuComplex* x, int incx);
120
+ void CUBLASWINAPI cublasZscal(int n, cuDoubleComplex alpha, cuDoubleComplex* x, int incx);
121
+
122
+ void CUBLASWINAPI cublasCsscal(int n, float alpha, cuComplex* x, int incx);
123
+ void CUBLASWINAPI cublasZdscal(int n, double alpha, cuDoubleComplex* x, int incx);
124
+ /*------------------------------------------------------------------------*/
125
+ /* AXPY */
126
+ void CUBLASWINAPI cublasSaxpy(int n, float alpha, const float* x, int incx, float* y, int incy);
127
+ void CUBLASWINAPI cublasDaxpy(int n, double alpha, const double* x, int incx, double* y, int incy);
128
+ void CUBLASWINAPI cublasCaxpy(int n, cuComplex alpha, const cuComplex* x, int incx, cuComplex* y, int incy);
129
+ void CUBLASWINAPI
130
+ cublasZaxpy(int n, cuDoubleComplex alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
131
+ /*------------------------------------------------------------------------*/
132
+ /* COPY */
133
+ void CUBLASWINAPI cublasScopy(int n, const float* x, int incx, float* y, int incy);
134
+ void CUBLASWINAPI cublasDcopy(int n, const double* x, int incx, double* y, int incy);
135
+ void CUBLASWINAPI cublasCcopy(int n, const cuComplex* x, int incx, cuComplex* y, int incy);
136
+ void CUBLASWINAPI cublasZcopy(int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
137
+ /*------------------------------------------------------------------------*/
138
+ /* SWAP */
139
+ void CUBLASWINAPI cublasSswap(int n, float* x, int incx, float* y, int incy);
140
+ void CUBLASWINAPI cublasDswap(int n, double* x, int incx, double* y, int incy);
141
+ void CUBLASWINAPI cublasCswap(int n, cuComplex* x, int incx, cuComplex* y, int incy);
142
+ void CUBLASWINAPI cublasZswap(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy);
143
+ /*------------------------------------------------------------------------*/
144
+ /* AMAX */
145
+ int CUBLASWINAPI cublasIsamax(int n, const float* x, int incx);
146
+ int CUBLASWINAPI cublasIdamax(int n, const double* x, int incx);
147
+ int CUBLASWINAPI cublasIcamax(int n, const cuComplex* x, int incx);
148
+ int CUBLASWINAPI cublasIzamax(int n, const cuDoubleComplex* x, int incx);
149
+ /*------------------------------------------------------------------------*/
150
+ /* AMIN */
151
+ int CUBLASWINAPI cublasIsamin(int n, const float* x, int incx);
152
+ int CUBLASWINAPI cublasIdamin(int n, const double* x, int incx);
153
+
154
+ int CUBLASWINAPI cublasIcamin(int n, const cuComplex* x, int incx);
155
+ int CUBLASWINAPI cublasIzamin(int n, const cuDoubleComplex* x, int incx);
156
+ /*------------------------------------------------------------------------*/
157
+ /* ASUM */
158
+ float CUBLASWINAPI cublasSasum(int n, const float* x, int incx);
159
+ double CUBLASWINAPI cublasDasum(int n, const double* x, int incx);
160
+ float CUBLASWINAPI cublasScasum(int n, const cuComplex* x, int incx);
161
+ double CUBLASWINAPI cublasDzasum(int n, const cuDoubleComplex* x, int incx);
162
+ /*------------------------------------------------------------------------*/
163
+ /* ROT */
164
+ void CUBLASWINAPI cublasSrot(int n, float* x, int incx, float* y, int incy, float sc, float ss);
165
+ void CUBLASWINAPI cublasDrot(int n, double* x, int incx, double* y, int incy, double sc, double ss);
166
+ void CUBLASWINAPI cublasCrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, cuComplex s);
167
+ void CUBLASWINAPI
168
+ cublasZrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double sc, cuDoubleComplex cs);
169
+ void CUBLASWINAPI cublasCsrot(int n, cuComplex* x, int incx, cuComplex* y, int incy, float c, float s);
170
+ void CUBLASWINAPI cublasZdrot(int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy, double c, double s);
171
+ /*------------------------------------------------------------------------*/
172
+ /* ROTG */
173
+ void CUBLASWINAPI cublasSrotg(float* sa, float* sb, float* sc, float* ss);
174
+ void CUBLASWINAPI cublasDrotg(double* sa, double* sb, double* sc, double* ss);
175
+ void CUBLASWINAPI cublasCrotg(cuComplex* ca, cuComplex cb, float* sc, cuComplex* cs);
176
+ void CUBLASWINAPI cublasZrotg(cuDoubleComplex* ca, cuDoubleComplex cb, double* sc, cuDoubleComplex* cs);
177
+ /*------------------------------------------------------------------------*/
178
+ /* ROTM */
179
+ void CUBLASWINAPI cublasSrotm(int n, float* x, int incx, float* y, int incy, const float* sparam);
180
+ void CUBLASWINAPI cublasDrotm(int n, double* x, int incx, double* y, int incy, const double* sparam);
181
+ /*------------------------------------------------------------------------*/
182
+ /* ROTMG */
183
+ void CUBLASWINAPI cublasSrotmg(float* sd1, float* sd2, float* sx1, const float* sy1, float* sparam);
184
+ void CUBLASWINAPI cublasDrotmg(double* sd1, double* sd2, double* sx1, const double* sy1, double* sparam);
185
+
186
+ /* --------------- CUBLAS BLAS2 functions ---------------- */
187
+ /* GEMV */
188
+ void CUBLASWINAPI cublasSgemv(char trans,
189
+ int m,
190
+ int n,
191
+ float alpha,
192
+ const float* A,
193
+ int lda,
194
+ const float* x,
195
+ int incx,
196
+ float beta,
197
+ float* y,
198
+ int incy);
199
+ void CUBLASWINAPI cublasDgemv(char trans,
200
+ int m,
201
+ int n,
202
+ double alpha,
203
+ const double* A,
204
+ int lda,
205
+ const double* x,
206
+ int incx,
207
+ double beta,
208
+ double* y,
209
+ int incy);
210
+ void CUBLASWINAPI cublasCgemv(char trans,
211
+ int m,
212
+ int n,
213
+ cuComplex alpha,
214
+ const cuComplex* A,
215
+ int lda,
216
+ const cuComplex* x,
217
+ int incx,
218
+ cuComplex beta,
219
+ cuComplex* y,
220
+ int incy);
221
+ void CUBLASWINAPI cublasZgemv(char trans,
222
+ int m,
223
+ int n,
224
+ cuDoubleComplex alpha,
225
+ const cuDoubleComplex* A,
226
+ int lda,
227
+ const cuDoubleComplex* x,
228
+ int incx,
229
+ cuDoubleComplex beta,
230
+ cuDoubleComplex* y,
231
+ int incy);
232
+ /*------------------------------------------------------------------------*/
233
+ /* GBMV */
234
+ void CUBLASWINAPI cublasSgbmv(char trans,
235
+ int m,
236
+ int n,
237
+ int kl,
238
+ int ku,
239
+ float alpha,
240
+ const float* A,
241
+ int lda,
242
+ const float* x,
243
+ int incx,
244
+ float beta,
245
+ float* y,
246
+ int incy);
247
+ void CUBLASWINAPI cublasDgbmv(char trans,
248
+ int m,
249
+ int n,
250
+ int kl,
251
+ int ku,
252
+ double alpha,
253
+ const double* A,
254
+ int lda,
255
+ const double* x,
256
+ int incx,
257
+ double beta,
258
+ double* y,
259
+ int incy);
260
+ void CUBLASWINAPI cublasCgbmv(char trans,
261
+ int m,
262
+ int n,
263
+ int kl,
264
+ int ku,
265
+ cuComplex alpha,
266
+ const cuComplex* A,
267
+ int lda,
268
+ const cuComplex* x,
269
+ int incx,
270
+ cuComplex beta,
271
+ cuComplex* y,
272
+ int incy);
273
+ void CUBLASWINAPI cublasZgbmv(char trans,
274
+ int m,
275
+ int n,
276
+ int kl,
277
+ int ku,
278
+ cuDoubleComplex alpha,
279
+ const cuDoubleComplex* A,
280
+ int lda,
281
+ const cuDoubleComplex* x,
282
+ int incx,
283
+ cuDoubleComplex beta,
284
+ cuDoubleComplex* y,
285
+ int incy);
286
+ /*------------------------------------------------------------------------*/
287
+ /* TRMV */
288
+ void CUBLASWINAPI cublasStrmv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
289
+ void CUBLASWINAPI cublasDtrmv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
290
+ void CUBLASWINAPI
291
+ cublasCtrmv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
292
+ void CUBLASWINAPI
293
+ cublasZtrmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
294
+ /*------------------------------------------------------------------------*/
295
+ /* TBMV */
296
+ void CUBLASWINAPI
297
+ cublasStbmv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
298
+ void CUBLASWINAPI
299
+ cublasDtbmv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
300
+ void CUBLASWINAPI
301
+ cublasCtbmv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
302
+ void CUBLASWINAPI cublasZtbmv(
303
+ char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
304
+ /*------------------------------------------------------------------------*/
305
+ /* TPMV */
306
+ void CUBLASWINAPI cublasStpmv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
307
+
308
+ void CUBLASWINAPI cublasDtpmv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
309
+
310
+ void CUBLASWINAPI cublasCtpmv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
311
+
312
+ void CUBLASWINAPI
313
+ cublasZtpmv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
314
+ /*------------------------------------------------------------------------*/
315
+ /* TRSV */
316
+ void CUBLASWINAPI cublasStrsv(char uplo, char trans, char diag, int n, const float* A, int lda, float* x, int incx);
317
+
318
+ void CUBLASWINAPI cublasDtrsv(char uplo, char trans, char diag, int n, const double* A, int lda, double* x, int incx);
319
+
320
+ void CUBLASWINAPI
321
+ cublasCtrsv(char uplo, char trans, char diag, int n, const cuComplex* A, int lda, cuComplex* x, int incx);
322
+
323
+ void CUBLASWINAPI
324
+ cublasZtrsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
325
+ /*------------------------------------------------------------------------*/
326
+ /* TPSV */
327
+ void CUBLASWINAPI cublasStpsv(char uplo, char trans, char diag, int n, const float* AP, float* x, int incx);
328
+
329
+ void CUBLASWINAPI cublasDtpsv(char uplo, char trans, char diag, int n, const double* AP, double* x, int incx);
330
+
331
+ void CUBLASWINAPI cublasCtpsv(char uplo, char trans, char diag, int n, const cuComplex* AP, cuComplex* x, int incx);
332
+
333
+ void CUBLASWINAPI
334
+ cublasZtpsv(char uplo, char trans, char diag, int n, const cuDoubleComplex* AP, cuDoubleComplex* x, int incx);
335
+ /*------------------------------------------------------------------------*/
336
+ /* TBSV */
337
+ void CUBLASWINAPI
338
+ cublasStbsv(char uplo, char trans, char diag, int n, int k, const float* A, int lda, float* x, int incx);
339
+
340
+ void CUBLASWINAPI
341
+ cublasDtbsv(char uplo, char trans, char diag, int n, int k, const double* A, int lda, double* x, int incx);
342
+ void CUBLASWINAPI
343
+ cublasCtbsv(char uplo, char trans, char diag, int n, int k, const cuComplex* A, int lda, cuComplex* x, int incx);
344
+
345
+ void CUBLASWINAPI cublasZtbsv(
346
+ char uplo, char trans, char diag, int n, int k, const cuDoubleComplex* A, int lda, cuDoubleComplex* x, int incx);
347
+ /*------------------------------------------------------------------------*/
348
+ /* SYMV/HEMV */
349
+ void CUBLASWINAPI cublasSsymv(
350
+ char uplo, int n, float alpha, const float* A, int lda, const float* x, int incx, float beta, float* y, int incy);
351
+ void CUBLASWINAPI cublasDsymv(char uplo,
352
+ int n,
353
+ double alpha,
354
+ const double* A,
355
+ int lda,
356
+ const double* x,
357
+ int incx,
358
+ double beta,
359
+ double* y,
360
+ int incy);
361
+ void CUBLASWINAPI cublasChemv(char uplo,
362
+ int n,
363
+ cuComplex alpha,
364
+ const cuComplex* A,
365
+ int lda,
366
+ const cuComplex* x,
367
+ int incx,
368
+ cuComplex beta,
369
+ cuComplex* y,
370
+ int incy);
371
+ void CUBLASWINAPI cublasZhemv(char uplo,
372
+ int n,
373
+ cuDoubleComplex alpha,
374
+ const cuDoubleComplex* A,
375
+ int lda,
376
+ const cuDoubleComplex* x,
377
+ int incx,
378
+ cuDoubleComplex beta,
379
+ cuDoubleComplex* y,
380
+ int incy);
381
+ /*------------------------------------------------------------------------*/
382
+ /* SBMV/HBMV */
383
+ void CUBLASWINAPI cublasSsbmv(char uplo,
384
+ int n,
385
+ int k,
386
+ float alpha,
387
+ const float* A,
388
+ int lda,
389
+ const float* x,
390
+ int incx,
391
+ float beta,
392
+ float* y,
393
+ int incy);
394
+ void CUBLASWINAPI cublasDsbmv(char uplo,
395
+ int n,
396
+ int k,
397
+ double alpha,
398
+ const double* A,
399
+ int lda,
400
+ const double* x,
401
+ int incx,
402
+ double beta,
403
+ double* y,
404
+ int incy);
405
+ void CUBLASWINAPI cublasChbmv(char uplo,
406
+ int n,
407
+ int k,
408
+ cuComplex alpha,
409
+ const cuComplex* A,
410
+ int lda,
411
+ const cuComplex* x,
412
+ int incx,
413
+ cuComplex beta,
414
+ cuComplex* y,
415
+ int incy);
416
+ void CUBLASWINAPI cublasZhbmv(char uplo,
417
+ int n,
418
+ int k,
419
+ cuDoubleComplex alpha,
420
+ const cuDoubleComplex* A,
421
+ int lda,
422
+ const cuDoubleComplex* x,
423
+ int incx,
424
+ cuDoubleComplex beta,
425
+ cuDoubleComplex* y,
426
+ int incy);
427
+ /*------------------------------------------------------------------------*/
428
+ /* SPMV/HPMV */
429
+ void CUBLASWINAPI
430
+ cublasSspmv(char uplo, int n, float alpha, const float* AP, const float* x, int incx, float beta, float* y, int incy);
431
+ void CUBLASWINAPI cublasDspmv(
432
+ char uplo, int n, double alpha, const double* AP, const double* x, int incx, double beta, double* y, int incy);
433
+ void CUBLASWINAPI cublasChpmv(char uplo,
434
+ int n,
435
+ cuComplex alpha,
436
+ const cuComplex* AP,
437
+ const cuComplex* x,
438
+ int incx,
439
+ cuComplex beta,
440
+ cuComplex* y,
441
+ int incy);
442
+ void CUBLASWINAPI cublasZhpmv(char uplo,
443
+ int n,
444
+ cuDoubleComplex alpha,
445
+ const cuDoubleComplex* AP,
446
+ const cuDoubleComplex* x,
447
+ int incx,
448
+ cuDoubleComplex beta,
449
+ cuDoubleComplex* y,
450
+ int incy);
451
+
452
+ /*------------------------------------------------------------------------*/
453
+ /* GER */
454
+ void CUBLASWINAPI
455
+ cublasSger(int m, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
456
+ void CUBLASWINAPI
457
+ cublasDger(int m, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
458
+
459
+ void CUBLASWINAPI cublasCgeru(
460
+ int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
461
+ void CUBLASWINAPI cublasCgerc(
462
+ int m, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* A, int lda);
463
+ void CUBLASWINAPI cublasZgeru(int m,
464
+ int n,
465
+ cuDoubleComplex alpha,
466
+ const cuDoubleComplex* x,
467
+ int incx,
468
+ const cuDoubleComplex* y,
469
+ int incy,
470
+ cuDoubleComplex* A,
471
+ int lda);
472
+ void CUBLASWINAPI cublasZgerc(int m,
473
+ int n,
474
+ cuDoubleComplex alpha,
475
+ const cuDoubleComplex* x,
476
+ int incx,
477
+ const cuDoubleComplex* y,
478
+ int incy,
479
+ cuDoubleComplex* A,
480
+ int lda);
481
+ /*------------------------------------------------------------------------*/
482
+ /* SYR/HER */
483
+ void CUBLASWINAPI cublasSsyr(char uplo, int n, float alpha, const float* x, int incx, float* A, int lda);
484
+ void CUBLASWINAPI cublasDsyr(char uplo, int n, double alpha, const double* x, int incx, double* A, int lda);
485
+
486
+ void CUBLASWINAPI cublasCher(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* A, int lda);
487
+ void CUBLASWINAPI
488
+ cublasZher(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* A, int lda);
489
+
490
+ /*------------------------------------------------------------------------*/
491
+ /* SPR/HPR */
492
+ void CUBLASWINAPI cublasSspr(char uplo, int n, float alpha, const float* x, int incx, float* AP);
493
+ void CUBLASWINAPI cublasDspr(char uplo, int n, double alpha, const double* x, int incx, double* AP);
494
+ void CUBLASWINAPI cublasChpr(char uplo, int n, float alpha, const cuComplex* x, int incx, cuComplex* AP);
495
+ void CUBLASWINAPI cublasZhpr(char uplo, int n, double alpha, const cuDoubleComplex* x, int incx, cuDoubleComplex* AP);
496
+ /*------------------------------------------------------------------------*/
497
+ /* SYR2/HER2 */
498
+ void CUBLASWINAPI
499
+ cublasSsyr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* A, int lda);
500
+ void CUBLASWINAPI
501
+ cublasDsyr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* A, int lda);
502
+ void CUBLASWINAPI cublasCher2(char uplo,
503
+ int n,
504
+ cuComplex alpha,
505
+ const cuComplex* x,
506
+ int incx,
507
+ const cuComplex* y,
508
+ int incy,
509
+ cuComplex* A,
510
+ int lda);
511
+ void CUBLASWINAPI cublasZher2(char uplo,
512
+ int n,
513
+ cuDoubleComplex alpha,
514
+ const cuDoubleComplex* x,
515
+ int incx,
516
+ const cuDoubleComplex* y,
517
+ int incy,
518
+ cuDoubleComplex* A,
519
+ int lda);
520
+
521
+ /*------------------------------------------------------------------------*/
522
+ /* SPR2/HPR2 */
523
+ void CUBLASWINAPI
524
+ cublasSspr2(char uplo, int n, float alpha, const float* x, int incx, const float* y, int incy, float* AP);
525
+ void CUBLASWINAPI
526
+ cublasDspr2(char uplo, int n, double alpha, const double* x, int incx, const double* y, int incy, double* AP);
527
+ void CUBLASWINAPI cublasChpr2(
528
+ char uplo, int n, cuComplex alpha, const cuComplex* x, int incx, const cuComplex* y, int incy, cuComplex* AP);
529
+ void CUBLASWINAPI cublasZhpr2(char uplo,
530
+ int n,
531
+ cuDoubleComplex alpha,
532
+ const cuDoubleComplex* x,
533
+ int incx,
534
+ const cuDoubleComplex* y,
535
+ int incy,
536
+ cuDoubleComplex* AP);
537
+ /* ------------------------BLAS3 Functions ------------------------------- */
538
+ /* GEMM */
539
+ void CUBLASWINAPI cublasSgemm(char transa,
540
+ char transb,
541
+ int m,
542
+ int n,
543
+ int k,
544
+ float alpha,
545
+ const float* A,
546
+ int lda,
547
+ const float* B,
548
+ int ldb,
549
+ float beta,
550
+ float* C,
551
+ int ldc);
552
+ void CUBLASWINAPI cublasDgemm(char transa,
553
+ char transb,
554
+ int m,
555
+ int n,
556
+ int k,
557
+ double alpha,
558
+ const double* A,
559
+ int lda,
560
+ const double* B,
561
+ int ldb,
562
+ double beta,
563
+ double* C,
564
+ int ldc);
565
+ void CUBLASWINAPI cublasCgemm(char transa,
566
+ char transb,
567
+ int m,
568
+ int n,
569
+ int k,
570
+ cuComplex alpha,
571
+ const cuComplex* A,
572
+ int lda,
573
+ const cuComplex* B,
574
+ int ldb,
575
+ cuComplex beta,
576
+ cuComplex* C,
577
+ int ldc);
578
+ void CUBLASWINAPI cublasZgemm(char transa,
579
+ char transb,
580
+ int m,
581
+ int n,
582
+ int k,
583
+ cuDoubleComplex alpha,
584
+ const cuDoubleComplex* A,
585
+ int lda,
586
+ const cuDoubleComplex* B,
587
+ int ldb,
588
+ cuDoubleComplex beta,
589
+ cuDoubleComplex* C,
590
+ int ldc);
591
+ /* -------------------------------------------------------*/
592
+ /* SYRK */
593
+ void CUBLASWINAPI
594
+ cublasSsyrk(char uplo, char trans, int n, int k, float alpha, const float* A, int lda, float beta, float* C, int ldc);
595
+ void CUBLASWINAPI cublasDsyrk(
596
+ char uplo, char trans, int n, int k, double alpha, const double* A, int lda, double beta, double* C, int ldc);
597
+
598
+ void CUBLASWINAPI cublasCsyrk(char uplo,
599
+ char trans,
600
+ int n,
601
+ int k,
602
+ cuComplex alpha,
603
+ const cuComplex* A,
604
+ int lda,
605
+ cuComplex beta,
606
+ cuComplex* C,
607
+ int ldc);
608
+ void CUBLASWINAPI cublasZsyrk(char uplo,
609
+ char trans,
610
+ int n,
611
+ int k,
612
+ cuDoubleComplex alpha,
613
+ const cuDoubleComplex* A,
614
+ int lda,
615
+ cuDoubleComplex beta,
616
+ cuDoubleComplex* C,
617
+ int ldc);
618
+ /* ------------------------------------------------------- */
619
+ /* HERK */
620
+ void CUBLASWINAPI cublasCherk(
621
+ char uplo, char trans, int n, int k, float alpha, const cuComplex* A, int lda, float beta, cuComplex* C, int ldc);
622
+ void CUBLASWINAPI cublasZherk(char uplo,
623
+ char trans,
624
+ int n,
625
+ int k,
626
+ double alpha,
627
+ const cuDoubleComplex* A,
628
+ int lda,
629
+ double beta,
630
+ cuDoubleComplex* C,
631
+ int ldc);
632
+ /* ------------------------------------------------------- */
633
+ /* SYR2K */
634
+ void CUBLASWINAPI cublasSsyr2k(char uplo,
635
+ char trans,
636
+ int n,
637
+ int k,
638
+ float alpha,
639
+ const float* A,
640
+ int lda,
641
+ const float* B,
642
+ int ldb,
643
+ float beta,
644
+ float* C,
645
+ int ldc);
646
+
647
+ void CUBLASWINAPI cublasDsyr2k(char uplo,
648
+ char trans,
649
+ int n,
650
+ int k,
651
+ double alpha,
652
+ const double* A,
653
+ int lda,
654
+ const double* B,
655
+ int ldb,
656
+ double beta,
657
+ double* C,
658
+ int ldc);
659
+ void CUBLASWINAPI cublasCsyr2k(char uplo,
660
+ char trans,
661
+ int n,
662
+ int k,
663
+ cuComplex alpha,
664
+ const cuComplex* A,
665
+ int lda,
666
+ const cuComplex* B,
667
+ int ldb,
668
+ cuComplex beta,
669
+ cuComplex* C,
670
+ int ldc);
671
+
672
+ void CUBLASWINAPI cublasZsyr2k(char uplo,
673
+ char trans,
674
+ int n,
675
+ int k,
676
+ cuDoubleComplex alpha,
677
+ const cuDoubleComplex* A,
678
+ int lda,
679
+ const cuDoubleComplex* B,
680
+ int ldb,
681
+ cuDoubleComplex beta,
682
+ cuDoubleComplex* C,
683
+ int ldc);
684
+ /* ------------------------------------------------------- */
685
+ /* HER2K */
686
+ void CUBLASWINAPI cublasCher2k(char uplo,
687
+ char trans,
688
+ int n,
689
+ int k,
690
+ cuComplex alpha,
691
+ const cuComplex* A,
692
+ int lda,
693
+ const cuComplex* B,
694
+ int ldb,
695
+ float beta,
696
+ cuComplex* C,
697
+ int ldc);
698
+
699
+ void CUBLASWINAPI cublasZher2k(char uplo,
700
+ char trans,
701
+ int n,
702
+ int k,
703
+ cuDoubleComplex alpha,
704
+ const cuDoubleComplex* A,
705
+ int lda,
706
+ const cuDoubleComplex* B,
707
+ int ldb,
708
+ double beta,
709
+ cuDoubleComplex* C,
710
+ int ldc);
711
+
712
+ /*------------------------------------------------------------------------*/
713
+ /* SYMM*/
714
+ void CUBLASWINAPI cublasSsymm(char side,
715
+ char uplo,
716
+ int m,
717
+ int n,
718
+ float alpha,
719
+ const float* A,
720
+ int lda,
721
+ const float* B,
722
+ int ldb,
723
+ float beta,
724
+ float* C,
725
+ int ldc);
726
+ void CUBLASWINAPI cublasDsymm(char side,
727
+ char uplo,
728
+ int m,
729
+ int n,
730
+ double alpha,
731
+ const double* A,
732
+ int lda,
733
+ const double* B,
734
+ int ldb,
735
+ double beta,
736
+ double* C,
737
+ int ldc);
738
+
739
+ void CUBLASWINAPI cublasCsymm(char side,
740
+ char uplo,
741
+ int m,
742
+ int n,
743
+ cuComplex alpha,
744
+ const cuComplex* A,
745
+ int lda,
746
+ const cuComplex* B,
747
+ int ldb,
748
+ cuComplex beta,
749
+ cuComplex* C,
750
+ int ldc);
751
+
752
+ void CUBLASWINAPI cublasZsymm(char side,
753
+ char uplo,
754
+ int m,
755
+ int n,
756
+ cuDoubleComplex alpha,
757
+ const cuDoubleComplex* A,
758
+ int lda,
759
+ const cuDoubleComplex* B,
760
+ int ldb,
761
+ cuDoubleComplex beta,
762
+ cuDoubleComplex* C,
763
+ int ldc);
764
+ /*------------------------------------------------------------------------*/
765
+ /* HEMM*/
766
+ void CUBLASWINAPI cublasChemm(char side,
767
+ char uplo,
768
+ int m,
769
+ int n,
770
+ cuComplex alpha,
771
+ const cuComplex* A,
772
+ int lda,
773
+ const cuComplex* B,
774
+ int ldb,
775
+ cuComplex beta,
776
+ cuComplex* C,
777
+ int ldc);
778
+ void CUBLASWINAPI cublasZhemm(char side,
779
+ char uplo,
780
+ int m,
781
+ int n,
782
+ cuDoubleComplex alpha,
783
+ const cuDoubleComplex* A,
784
+ int lda,
785
+ const cuDoubleComplex* B,
786
+ int ldb,
787
+ cuDoubleComplex beta,
788
+ cuDoubleComplex* C,
789
+ int ldc);
790
+
791
+ /*------------------------------------------------------------------------*/
792
+ /* TRSM*/
793
+ void CUBLASWINAPI cublasStrsm(char side,
794
+ char uplo,
795
+ char transa,
796
+ char diag,
797
+ int m,
798
+ int n,
799
+ float alpha,
800
+ const float* A,
801
+ int lda,
802
+ float* B,
803
+ int ldb);
804
+
805
+ void CUBLASWINAPI cublasDtrsm(char side,
806
+ char uplo,
807
+ char transa,
808
+ char diag,
809
+ int m,
810
+ int n,
811
+ double alpha,
812
+ const double* A,
813
+ int lda,
814
+ double* B,
815
+ int ldb);
816
+
817
+ void CUBLASWINAPI cublasCtrsm(char side,
818
+ char uplo,
819
+ char transa,
820
+ char diag,
821
+ int m,
822
+ int n,
823
+ cuComplex alpha,
824
+ const cuComplex* A,
825
+ int lda,
826
+ cuComplex* B,
827
+ int ldb);
828
+
829
+ void CUBLASWINAPI cublasZtrsm(char side,
830
+ char uplo,
831
+ char transa,
832
+ char diag,
833
+ int m,
834
+ int n,
835
+ cuDoubleComplex alpha,
836
+ const cuDoubleComplex* A,
837
+ int lda,
838
+ cuDoubleComplex* B,
839
+ int ldb);
840
+ /*------------------------------------------------------------------------*/
841
+ /* TRMM*/
842
+ void CUBLASWINAPI cublasStrmm(char side,
843
+ char uplo,
844
+ char transa,
845
+ char diag,
846
+ int m,
847
+ int n,
848
+ float alpha,
849
+ const float* A,
850
+ int lda,
851
+ float* B,
852
+ int ldb);
853
+ void CUBLASWINAPI cublasDtrmm(char side,
854
+ char uplo,
855
+ char transa,
856
+ char diag,
857
+ int m,
858
+ int n,
859
+ double alpha,
860
+ const double* A,
861
+ int lda,
862
+ double* B,
863
+ int ldb);
864
+ void CUBLASWINAPI cublasCtrmm(char side,
865
+ char uplo,
866
+ char transa,
867
+ char diag,
868
+ int m,
869
+ int n,
870
+ cuComplex alpha,
871
+ const cuComplex* A,
872
+ int lda,
873
+ cuComplex* B,
874
+ int ldb);
875
+ void CUBLASWINAPI cublasZtrmm(char side,
876
+ char uplo,
877
+ char transa,
878
+ char diag,
879
+ int m,
880
+ int n,
881
+ cuDoubleComplex alpha,
882
+ const cuDoubleComplex* A,
883
+ int lda,
884
+ cuDoubleComplex* B,
885
+ int ldb);
886
+
887
+ #if defined(__cplusplus)
888
+ }
889
+ #endif /* __cplusplus */
890
+
891
+ #endif /* !defined(CUBLAS_H_) */
.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasLt.h ADDED
@@ -0,0 +1,1845 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+ #pragma once
50
+
51
+ #ifndef CUBLASAPI
52
+ #ifdef __CUDACC__
53
+ #define CUBLASAPI __host__ __device__
54
+ #else
55
+ #define CUBLASAPI
56
+ #endif
57
+ #endif
58
+
59
+ #include <cublas_api.h>
60
+
61
+ #include <stdint.h>
62
+ #include <stddef.h>
63
+ #include <stdio.h>
64
+
65
+ #if defined(__cplusplus)
66
+ extern "C" {
67
+ #endif /* __cplusplus */
68
+
69
+ /** Opaque structure holding CUBLASLT context
70
+ */
71
+ typedef struct cublasLtContext* cublasLtHandle_t;
72
+
73
+ cublasStatus_t CUBLASWINAPI cublasLtCreate(cublasLtHandle_t* lightHandle);
74
+
75
+ cublasStatus_t CUBLASWINAPI cublasLtDestroy(cublasLtHandle_t lightHandle);
76
+
77
+ const char* CUBLASWINAPI cublasLtGetStatusName(cublasStatus_t status);
78
+
79
+ const char* CUBLASWINAPI cublasLtGetStatusString(cublasStatus_t status);
80
+
81
+ size_t CUBLASWINAPI cublasLtGetVersion(void);
82
+
83
+ size_t CUBLASWINAPI cublasLtGetCudartVersion(void);
84
+
85
+ cublasStatus_t CUBLASWINAPI cublasLtGetProperty(libraryPropertyType type, int* value);
86
+
87
+ cublasStatus_t CUBLASWINAPI cublasLtHeuristicsCacheGetCapacity(size_t* capacity);
88
+ cublasStatus_t CUBLASWINAPI cublasLtHeuristicsCacheSetCapacity(size_t capacity);
89
+
90
+ /** Restricts usage of CPU instructions (ISA) specified by the flags in the mask.
91
+ *
92
+ * Flags can be combined with bitwise OR(|) operator. Supported flags:
93
+ * - 0x1 -- x86-64 AVX512 ISA
94
+ *
95
+ * Default mask: 0 (any applicable ISA is allowed).
96
+ *
97
+ * The function returns the previous value of the mask.
98
+ * The function takes precedence over the environment variable CUBLASLT_DISABLE_CPU_INSTRUCTIONS_MASK.
99
+ */
100
+ unsigned CUBLASWINAPI cublasLtDisableCpuInstructionsSetMask(unsigned mask);
101
+
102
+ /** Semi-opaque descriptor for matrix memory layout
103
+ */
104
+ typedef struct {
105
+ uint64_t data[8];
106
+ } cublasLtMatrixLayoutOpaque_t;
107
+
108
+ /** Opaque descriptor for matrix memory layout
109
+ */
110
+ typedef cublasLtMatrixLayoutOpaque_t* cublasLtMatrixLayout_t;
111
+
112
+ /** Semi-opaque algorithm descriptor (to avoid complicated alloc/free schemes)
113
+ *
114
+ * This structure can be trivially serialized and later restored for use with the same version of cuBLAS library to save
115
+ * on selecting the right configuration again.
116
+ */
117
+ typedef struct {
118
+ uint64_t data[8];
119
+ } cublasLtMatmulAlgo_t;
120
+
121
+ /** Semi-opaque descriptor for cublasLtMatmul() operation details
122
+ */
123
+ typedef struct {
124
+ uint64_t data[32];
125
+ } cublasLtMatmulDescOpaque_t;
126
+
127
+ /** Opaque descriptor for cublasLtMatmul() operation details
128
+ */
129
+ typedef cublasLtMatmulDescOpaque_t* cublasLtMatmulDesc_t;
130
+
131
+ /** Semi-opaque descriptor for cublasLtMatrixTransform() operation details
132
+ */
133
+ typedef struct {
134
+ uint64_t data[8];
135
+ } cublasLtMatrixTransformDescOpaque_t;
136
+
137
+ /** Opaque descriptor for cublasLtMatrixTransform() operation details
138
+ */
139
+ typedef cublasLtMatrixTransformDescOpaque_t* cublasLtMatrixTransformDesc_t;
140
+
141
+ /** Semi-opaque descriptor for cublasLtMatmulPreference() operation details
142
+ */
143
+ typedef struct {
144
+ uint64_t data[8];
145
+ } cublasLtMatmulPreferenceOpaque_t;
146
+
147
+ /** Opaque descriptor for cublasLtMatmulAlgoGetHeuristic() configuration
148
+ */
149
+ typedef cublasLtMatmulPreferenceOpaque_t* cublasLtMatmulPreference_t;
150
+
151
+ /** Tile size (in C/D matrix Rows x Cols)
152
+ *
153
+ * General order of tile IDs is sorted by size first and by first dimension second.
154
+ */
155
+ typedef enum {
156
+ CUBLASLT_MATMUL_TILE_UNDEFINED = 0,
157
+ CUBLASLT_MATMUL_TILE_8x8 = 1,
158
+ CUBLASLT_MATMUL_TILE_8x16 = 2,
159
+ CUBLASLT_MATMUL_TILE_16x8 = 3,
160
+ CUBLASLT_MATMUL_TILE_8x32 = 4,
161
+ CUBLASLT_MATMUL_TILE_16x16 = 5,
162
+ CUBLASLT_MATMUL_TILE_32x8 = 6,
163
+ CUBLASLT_MATMUL_TILE_8x64 = 7,
164
+ CUBLASLT_MATMUL_TILE_16x32 = 8,
165
+ CUBLASLT_MATMUL_TILE_32x16 = 9,
166
+ CUBLASLT_MATMUL_TILE_64x8 = 10,
167
+ CUBLASLT_MATMUL_TILE_32x32 = 11,
168
+ CUBLASLT_MATMUL_TILE_32x64 = 12,
169
+ CUBLASLT_MATMUL_TILE_64x32 = 13,
170
+ CUBLASLT_MATMUL_TILE_32x128 = 14,
171
+ CUBLASLT_MATMUL_TILE_64x64 = 15,
172
+ CUBLASLT_MATMUL_TILE_128x32 = 16,
173
+ CUBLASLT_MATMUL_TILE_64x128 = 17,
174
+ CUBLASLT_MATMUL_TILE_128x64 = 18,
175
+ CUBLASLT_MATMUL_TILE_64x256 = 19,
176
+ CUBLASLT_MATMUL_TILE_128x128 = 20,
177
+ CUBLASLT_MATMUL_TILE_256x64 = 21,
178
+ CUBLASLT_MATMUL_TILE_64x512 = 22,
179
+ CUBLASLT_MATMUL_TILE_128x256 = 23,
180
+ CUBLASLT_MATMUL_TILE_256x128 = 24,
181
+ CUBLASLT_MATMUL_TILE_512x64 = 25,
182
+ CUBLASLT_MATMUL_TILE_64x96 = 26,
183
+ CUBLASLT_MATMUL_TILE_96x64 = 27,
184
+ CUBLASLT_MATMUL_TILE_96x128 = 28,
185
+ CUBLASLT_MATMUL_TILE_128x160 = 29,
186
+ CUBLASLT_MATMUL_TILE_160x128 = 30,
187
+ CUBLASLT_MATMUL_TILE_192x128 = 31,
188
+ CUBLASLT_MATMUL_TILE_128x192 = 32,
189
+ CUBLASLT_MATMUL_TILE_128x96 = 33,
190
+ CUBLASLT_MATMUL_TILE_32x256 = 34,
191
+ CUBLASLT_MATMUL_TILE_256x32 = 35,
192
+ CUBLASLT_MATMUL_TILE_END
193
+ } cublasLtMatmulTile_t;
194
+
195
+ /** Size and number of stages in which elements are read into shared memory
196
+ *
197
+ * General order of stages IDs is sorted by stage size first and by number of stages second.
198
+ */
199
+ typedef enum {
200
+ CUBLASLT_MATMUL_STAGES_UNDEFINED = 0,
201
+ CUBLASLT_MATMUL_STAGES_16x1 = 1,
202
+ CUBLASLT_MATMUL_STAGES_16x2 = 2,
203
+ CUBLASLT_MATMUL_STAGES_16x3 = 3,
204
+ CUBLASLT_MATMUL_STAGES_16x4 = 4,
205
+ CUBLASLT_MATMUL_STAGES_16x5 = 5,
206
+ CUBLASLT_MATMUL_STAGES_16x6 = 6,
207
+ CUBLASLT_MATMUL_STAGES_32x1 = 7,
208
+ CUBLASLT_MATMUL_STAGES_32x2 = 8,
209
+ CUBLASLT_MATMUL_STAGES_32x3 = 9,
210
+ CUBLASLT_MATMUL_STAGES_32x4 = 10,
211
+ CUBLASLT_MATMUL_STAGES_32x5 = 11,
212
+ CUBLASLT_MATMUL_STAGES_32x6 = 12,
213
+ CUBLASLT_MATMUL_STAGES_64x1 = 13,
214
+ CUBLASLT_MATMUL_STAGES_64x2 = 14,
215
+ CUBLASLT_MATMUL_STAGES_64x3 = 15,
216
+ CUBLASLT_MATMUL_STAGES_64x4 = 16,
217
+ CUBLASLT_MATMUL_STAGES_64x5 = 17,
218
+ CUBLASLT_MATMUL_STAGES_64x6 = 18,
219
+ CUBLASLT_MATMUL_STAGES_128x1 = 19,
220
+ CUBLASLT_MATMUL_STAGES_128x2 = 20,
221
+ CUBLASLT_MATMUL_STAGES_128x3 = 21,
222
+ CUBLASLT_MATMUL_STAGES_128x4 = 22,
223
+ CUBLASLT_MATMUL_STAGES_128x5 = 23,
224
+ CUBLASLT_MATMUL_STAGES_128x6 = 24,
225
+ CUBLASLT_MATMUL_STAGES_32x10 = 25,
226
+ CUBLASLT_MATMUL_STAGES_8x4 = 26,
227
+ CUBLASLT_MATMUL_STAGES_16x10 = 27,
228
+ CUBLASLT_MATMUL_STAGES_8x5 = 28,
229
+ CUBLASLT_MATMUL_STAGES_8x3 = 31,
230
+ CUBLASLT_MATMUL_STAGES_8xAUTO = 32,
231
+ CUBLASLT_MATMUL_STAGES_16xAUTO = 33,
232
+ CUBLASLT_MATMUL_STAGES_32xAUTO = 34,
233
+ CUBLASLT_MATMUL_STAGES_64xAUTO = 35,
234
+ CUBLASLT_MATMUL_STAGES_128xAUTO = 36,
235
+ CUBLASLT_MATMUL_STAGES_END
236
+ } cublasLtMatmulStages_t;
237
+
238
+ /** Thread Block Cluster size
239
+ *
240
+ * Typically dimensioned similar to cublasLtMatmulTile_t, with the third coordinate unused at this time.
241
+ */
242
+ typedef enum {
243
+ /** Let library pick cluster shape automatically */
244
+ CUBLASLT_CLUSTER_SHAPE_AUTO = 0,
245
+ CUBLASLT_CLUSTER_SHAPE_1x1x1 = 2,
246
+ CUBLASLT_CLUSTER_SHAPE_2x1x1 = 3,
247
+ CUBLASLT_CLUSTER_SHAPE_4x1x1 = 4,
248
+ CUBLASLT_CLUSTER_SHAPE_1x2x1 = 5,
249
+ CUBLASLT_CLUSTER_SHAPE_2x2x1 = 6,
250
+ CUBLASLT_CLUSTER_SHAPE_4x2x1 = 7,
251
+ CUBLASLT_CLUSTER_SHAPE_1x4x1 = 8,
252
+ CUBLASLT_CLUSTER_SHAPE_2x4x1 = 9,
253
+ CUBLASLT_CLUSTER_SHAPE_4x4x1 = 10,
254
+ CUBLASLT_CLUSTER_SHAPE_8x1x1 = 11,
255
+ CUBLASLT_CLUSTER_SHAPE_1x8x1 = 12,
256
+ CUBLASLT_CLUSTER_SHAPE_8x2x1 = 13,
257
+ CUBLASLT_CLUSTER_SHAPE_2x8x1 = 14,
258
+ CUBLASLT_CLUSTER_SHAPE_16x1x1 = 15,
259
+ CUBLASLT_CLUSTER_SHAPE_1x16x1 = 16,
260
+ CUBLASLT_CLUSTER_SHAPE_3x1x1 = 17,
261
+ CUBLASLT_CLUSTER_SHAPE_5x1x1 = 18,
262
+ CUBLASLT_CLUSTER_SHAPE_6x1x1 = 19,
263
+ CUBLASLT_CLUSTER_SHAPE_7x1x1 = 20,
264
+ CUBLASLT_CLUSTER_SHAPE_9x1x1 = 21,
265
+ CUBLASLT_CLUSTER_SHAPE_10x1x1 = 22,
266
+ CUBLASLT_CLUSTER_SHAPE_11x1x1 = 23,
267
+ CUBLASLT_CLUSTER_SHAPE_12x1x1 = 24,
268
+ CUBLASLT_CLUSTER_SHAPE_13x1x1 = 25,
269
+ CUBLASLT_CLUSTER_SHAPE_14x1x1 = 26,
270
+ CUBLASLT_CLUSTER_SHAPE_15x1x1 = 27,
271
+ CUBLASLT_CLUSTER_SHAPE_3x2x1 = 28,
272
+ CUBLASLT_CLUSTER_SHAPE_5x2x1 = 29,
273
+ CUBLASLT_CLUSTER_SHAPE_6x2x1 = 30,
274
+ CUBLASLT_CLUSTER_SHAPE_7x2x1 = 31,
275
+ CUBLASLT_CLUSTER_SHAPE_1x3x1 = 32,
276
+ CUBLASLT_CLUSTER_SHAPE_2x3x1 = 33,
277
+ CUBLASLT_CLUSTER_SHAPE_3x3x1 = 34,
278
+ CUBLASLT_CLUSTER_SHAPE_4x3x1 = 35,
279
+ CUBLASLT_CLUSTER_SHAPE_5x3x1 = 36,
280
+ CUBLASLT_CLUSTER_SHAPE_3x4x1 = 37,
281
+ CUBLASLT_CLUSTER_SHAPE_1x5x1 = 38,
282
+ CUBLASLT_CLUSTER_SHAPE_2x5x1 = 39,
283
+ CUBLASLT_CLUSTER_SHAPE_3x5x1 = 40,
284
+ CUBLASLT_CLUSTER_SHAPE_1x6x1 = 41,
285
+ CUBLASLT_CLUSTER_SHAPE_2x6x1 = 42,
286
+ CUBLASLT_CLUSTER_SHAPE_1x7x1 = 43,
287
+ CUBLASLT_CLUSTER_SHAPE_2x7x1 = 44,
288
+ CUBLASLT_CLUSTER_SHAPE_1x9x1 = 45,
289
+ CUBLASLT_CLUSTER_SHAPE_1x10x1 = 46,
290
+ CUBLASLT_CLUSTER_SHAPE_1x11x1 = 47,
291
+ CUBLASLT_CLUSTER_SHAPE_1x12x1 = 48,
292
+ CUBLASLT_CLUSTER_SHAPE_1x13x1 = 49,
293
+ CUBLASLT_CLUSTER_SHAPE_1x14x1 = 50,
294
+ CUBLASLT_CLUSTER_SHAPE_1x15x1 = 51,
295
+ CUBLASLT_CLUSTER_SHAPE_END
296
+ } cublasLtClusterShape_t;
297
+
298
+ /** Inner size of the kernel
299
+ *
300
+ * Represents various aspects of internal kernel design, that don't impact CUDA grid size but may have other more subtle
301
+ * effects.
302
+ *
303
+ */
304
+ typedef enum {
305
+ CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED = 0,
306
+ CUBLASLT_MATMUL_INNER_SHAPE_MMA884 = 1,
307
+ CUBLASLT_MATMUL_INNER_SHAPE_MMA1684 = 2,
308
+ CUBLASLT_MATMUL_INNER_SHAPE_MMA1688 = 3,
309
+ CUBLASLT_MATMUL_INNER_SHAPE_MMA16816 = 4,
310
+ CUBLASLT_MATMUL_INNER_SHAPE_END
311
+ } cublasLtMatmulInnerShape_t;
312
+
313
+ /** Pointer mode to use for alpha/beta */
314
+ typedef enum {
315
+ /** matches CUBLAS_POINTER_MODE_HOST, pointer targets a single value host memory */
316
+ CUBLASLT_POINTER_MODE_HOST = CUBLAS_POINTER_MODE_HOST,
317
+ /** matches CUBLAS_POINTER_MODE_DEVICE, pointer targets a single value device memory */
318
+ CUBLASLT_POINTER_MODE_DEVICE = CUBLAS_POINTER_MODE_DEVICE,
319
+ /** pointer targets an array in device memory */
320
+ CUBLASLT_POINTER_MODE_DEVICE_VECTOR = 2,
321
+ /** alpha pointer targets an array in device memory, beta is zero. Note:
322
+ CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE is not supported, must be 0. */
323
+ CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO = 3,
324
+ /** alpha pointer targets an array in device memory, beta is a single value in host memory. */
325
+ CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST = 4,
326
+ } cublasLtPointerMode_t;
327
+
328
+ /** Mask to define pointer mode capability */
329
+ typedef enum {
330
+ /** see CUBLASLT_POINTER_MODE_HOST */
331
+ CUBLASLT_POINTER_MODE_MASK_HOST = 1,
332
+ /** see CUBLASLT_POINTER_MODE_DEVICE */
333
+ CUBLASLT_POINTER_MODE_MASK_DEVICE = 2,
334
+ /** see CUBLASLT_POINTER_MODE_DEVICE_VECTOR */
335
+ CUBLASLT_POINTER_MODE_MASK_DEVICE_VECTOR = 4,
336
+ /** see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO */
337
+ CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_ZERO = 8,
338
+ /** see CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST */
339
+ CUBLASLT_POINTER_MODE_MASK_ALPHA_DEVICE_VECTOR_BETA_HOST = 16,
340
+ } cublasLtPointerModeMask_t;
341
+
342
+ /** Implementation details that may affect numerical behavior of algorithms. */
343
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_FMA (0x01ull << 0)
344
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_HMMA (0x02ull << 0)
345
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_IMMA (0x04ull << 0)
346
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_DMMA (0x08ull << 0)
347
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_TENSOR_OP_MASK (0xfeull << 0)
348
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_TYPE_MASK (0xffull << 0)
349
+
350
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_16F (0x01ull << 8)
351
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32F (0x02ull << 8)
352
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_64F (0x04ull << 8)
353
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_32I (0x08ull << 8)
354
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_ACCUMULATOR_TYPE_MASK (0xffull << 8)
355
+
356
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16F (0x01ull << 16)
357
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_16BF (0x02ull << 16)
358
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_TF32 (0x04ull << 16)
359
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_32F (0x08ull << 16)
360
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_64F (0x10ull << 16)
361
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8I (0x20ull << 16)
362
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E4M3 (0x40ull << 16)
363
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_INPUT_8F_E5M2 (0x80ull << 16)
364
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_OP_INPUT_TYPE_MASK (0xffull << 16)
365
+
366
+ #define CUBLASLT_NUMERICAL_IMPL_FLAGS_GAUSSIAN (0x01ull << 32)
367
+ typedef uint64_t cublasLtNumericalImplFlags_t;
368
+
369
+ /** Execute matrix multiplication (D = alpha * op(A) * op(B) + beta * C).
370
+ *
371
+ * \retval CUBLAS_STATUS_NOT_INITIALIZED if cuBLASLt handle has not been initialized
372
+ * \retval CUBLAS_STATUS_INVALID_VALUE if parameters are in conflict or in an impossible configuration; e.g.
373
+ * when workspaceSizeInBytes is less than workspace required by configured
374
+ * algo
375
+ * \retval CUBLAS_STATUS_NOT_SUPPORTED if current implementation on selected device doesn't support configured
376
+ * operation
377
+ * \retval CUBLAS_STATUS_ARCH_MISMATCH if configured operation cannot be run using selected device
378
+ * \retval CUBLAS_STATUS_EXECUTION_FAILED if cuda reported execution error from the device
379
+ * \retval CUBLAS_STATUS_SUCCESS if the operation completed successfully
380
+ */
381
+ cublasStatus_t CUBLASWINAPI cublasLtMatmul(cublasLtHandle_t lightHandle,
382
+ cublasLtMatmulDesc_t computeDesc,
383
+ const void* alpha, /* host or device pointer */
384
+ const void* A,
385
+ cublasLtMatrixLayout_t Adesc,
386
+ const void* B,
387
+ cublasLtMatrixLayout_t Bdesc,
388
+ const void* beta, /* host or device pointer */
389
+ const void* C,
390
+ cublasLtMatrixLayout_t Cdesc,
391
+ void* D,
392
+ cublasLtMatrixLayout_t Ddesc,
393
+ const cublasLtMatmulAlgo_t* algo,
394
+ void* workspace,
395
+ size_t workspaceSizeInBytes,
396
+ cudaStream_t stream);
397
+
398
+ /** Matrix layout conversion helper (C = alpha * op(A) + beta * op(B))
399
+ *
400
+ * Can be used to change memory order of data or to scale and shift the values.
401
+ *
402
+ * \retval CUBLAS_STATUS_NOT_INITIALIZED if cuBLASLt handle has not been initialized
403
+ * \retval CUBLAS_STATUS_INVALID_VALUE if parameters are in conflict or in an impossible configuration; e.g.
404
+ * when A is not NULL, but Adesc is NULL
405
+ * \retval CUBLAS_STATUS_NOT_SUPPORTED if current implementation on selected device doesn't support configured
406
+ * operation
407
+ * \retval CUBLAS_STATUS_ARCH_MISMATCH if configured operation cannot be run using selected device
408
+ * \retval CUBLAS_STATUS_EXECUTION_FAILED if cuda reported execution error from the device
409
+ * \retval CUBLAS_STATUS_SUCCESS if the operation completed successfully
410
+ */
411
+ cublasStatus_t CUBLASWINAPI cublasLtMatrixTransform(cublasLtHandle_t lightHandle,
412
+ cublasLtMatrixTransformDesc_t transformDesc,
413
+ const void* alpha, /* host or device pointer */
414
+ const void* A,
415
+ cublasLtMatrixLayout_t Adesc,
416
+ const void* beta, /* host or device pointer */
417
+ const void* B,
418
+ cublasLtMatrixLayout_t Bdesc,
419
+ void* C,
420
+ cublasLtMatrixLayout_t Cdesc,
421
+ cudaStream_t stream);
422
+
423
+ /* ---------------------------------------------------------------------------------------*/
424
+ /* Helper functions for cublasLtMatrixLayout_t */
425
+ /* ---------------------------------------------------------------------------------------*/
426
+
427
+ /** Enum for data ordering */
428
+ typedef enum {
429
+ /** Column-major
430
+ *
431
+ * Leading dimension is the stride (in elements) to the beginning of next column in memory.
432
+ */
433
+ CUBLASLT_ORDER_COL = 0,
434
+ /** Row major
435
+ *
436
+ * Leading dimension is the stride (in elements) to the beginning of next row in memory.
437
+ */
438
+ CUBLASLT_ORDER_ROW = 1,
439
+ /** Column-major ordered tiles of 32 columns.
440
+ *
441
+ * Leading dimension is the stride (in elements) to the beginning of next group of 32-columns. E.g. if matrix has 33
442
+ * columns and 2 rows, ld must be at least (32) * 2 = 64.
443
+ */
444
+ CUBLASLT_ORDER_COL32 = 2,
445
+ /** Column-major ordered tiles of composite tiles with total 32 columns and 8 rows, tile composed of interleaved
446
+ * inner tiles of 4 columns within 4 even or odd rows in an alternating pattern.
447
+ *
448
+ * Leading dimension is the stride (in elements) to the beginning of the first 32 column x 8 row tile for the next
449
+ * 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32 * 8) * 1 = 256.
450
+ */
451
+ CUBLASLT_ORDER_COL4_4R2_8C = 3,
452
+ /** Column-major ordered tiles of composite tiles with total 32 columns ands 32 rows.
453
+ * Element offset within the tile is calculated as (((row%8)/2*4+row/8)*2+row%2)*32+col.
454
+ *
455
+ * Leading dimension is the stride (in elements) to the beginning of the first 32 column x 32 row tile for the next
456
+ * 32-wide group of columns. E.g. if matrix has 33 columns and 1 row, ld must be at least (32*32)*1 = 1024.
457
+ */
458
+ CUBLASLT_ORDER_COL32_2R_4R4 = 4,
459
+
460
+ } cublasLtOrder_t;
461
+
462
/** Attributes of memory layout */
typedef enum {
  /** Data type, see cudaDataType.
   *
   * uint32_t
   */
  CUBLASLT_MATRIX_LAYOUT_TYPE = 0,

  /** Memory order of the data, see cublasLtOrder_t.
   *
   * int32_t, default: CUBLASLT_ORDER_COL
   */
  CUBLASLT_MATRIX_LAYOUT_ORDER = 1,

  /** Number of rows.
   *
   * Usually only values that can be expressed as int32_t are supported.
   *
   * uint64_t
   */
  CUBLASLT_MATRIX_LAYOUT_ROWS = 2,

  /** Number of columns.
   *
   * Usually only values that can be expressed as int32_t are supported.
   *
   * uint64_t
   */
  CUBLASLT_MATRIX_LAYOUT_COLS = 3,

  /** Matrix leading dimension.
   *
   * For CUBLASLT_ORDER_COL this is stride (in elements) of matrix column, for more details and documentation for
   * other memory orders see documentation for cublasLtOrder_t values.
   *
   * Currently only non-negative values are supported, must be large enough so that matrix memory locations are not
   * overlapping (e.g. greater or equal to CUBLASLT_MATRIX_LAYOUT_ROWS in case of CUBLASLT_ORDER_COL).
   *
   * int64_t;
   */
  CUBLASLT_MATRIX_LAYOUT_LD = 4,

  /** Number of matmul operations to perform in the batch.
   *
   * See also CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT
   *
   * int32_t, default: 1
   */
  CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT = 5,

  /** Stride (in elements) to the next matrix for strided batch operation.
   *
   * When matrix type is planar-complex (CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET != 0), batch stride
   * is interpreted by cublasLtMatmul() in number of real valued sub-elements. E.g. for data of type CUDA_C_16F,
   * offset of 1024B is encoded as a stride of value 512 (since each element of the real and imaginary matrices
   * is a 2B (16bit) floating point type).
   *
   * NOTE: A bug in cublasLtMatrixTransform() causes it to interpret the batch stride for a planar-complex matrix
   * as if it was specified in number of complex elements. Therefore an offset of 1024B must be encoded as stride
   * value 256 when calling cublasLtMatrixTransform() (each complex element is 4B with real and imaginary values 2B
   * each). This behavior is expected to be corrected in the next major cuBLAS version.
   *
   * int64_t, default: 0
   */
  CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET = 6,

  /** Stride (in bytes) to the imaginary plane for planar complex layout.
   *
   * int64_t, default: 0 - 0 means that layout is regular (real and imaginary parts of complex numbers are interleaved
   * in memory in each element)
   */
  CUBLASLT_MATRIX_LAYOUT_PLANE_OFFSET = 7,
} cublasLtMatrixLayoutAttribute_t;
535
+
536
/** Internal. Do not use directly.
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutInit_internal(  //
    cublasLtMatrixLayout_t matLayout,
    size_t size,
    cudaDataType type,
    uint64_t rows,
    uint64_t cols,
    int64_t ld);
545
+
546
/** Initialize matrix layout descriptor in pre-allocated space.
 *
 * \retval CUBLAS_STATUS_ALLOC_FAILED if size of the pre-allocated space is insufficient
 * \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully
 */
static inline cublasStatus_t cublasLtMatrixLayoutInit(
    cublasLtMatrixLayout_t matLayout, cudaDataType type, uint64_t rows, uint64_t cols, int64_t ld) {
  return cublasLtMatrixLayoutInit_internal(matLayout, sizeof(*matLayout), type, rows, cols, ld);
}
555
+
556
/** Create new matrix layout descriptor.
 *
 * \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated
 * \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutCreate(  //
    cublasLtMatrixLayout_t* matLayout,
    cudaDataType type,
    uint64_t rows,
    uint64_t cols,
    int64_t ld);

/** Destroy matrix layout descriptor.
 *
 * \retval CUBLAS_STATUS_SUCCESS if operation was successful
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutDestroy(cublasLtMatrixLayout_t matLayout);

/** Set matrix layout descriptor attribute.
 *
 * \param[in]  matLayout    The descriptor
 * \param[in]  attr         The attribute
 * \param[in]  buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutSetAttribute(  //
    cublasLtMatrixLayout_t matLayout,
    cublasLtMatrixLayoutAttribute_t attr,
    const void* buf,
    size_t sizeInBytes);

/** Get matrix layout descriptor attribute.
 *
 * \param[in]  matLayout    The descriptor
 * \param[in]  attr         The attribute
 * \param[out] buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
 *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
 *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixLayoutGetAttribute(  //
    cublasLtMatrixLayout_t matLayout,
    cublasLtMatrixLayoutAttribute_t attr,
    void* buf,
    size_t sizeInBytes,
    size_t* sizeWritten);
611
+
612
/* ---------------------------------------------------------------------------------------*/
/* Helper functions for cublasLtMatmulDesc_t */
/* ---------------------------------------------------------------------------------------*/
615
+
616
/** Matmul descriptor attributes to define details of the operation. */
typedef enum {
  /** Compute type, see cudaDataType. Defines data type used for multiply and accumulate operations and the
   * accumulator during matrix multiplication.
   *
   * int32_t
   */
  CUBLASLT_MATMUL_DESC_COMPUTE_TYPE = 0,

  /** Scale type, see cudaDataType. Defines data type of alpha and beta. Accumulator and value from matrix C are
   * typically converted to scale type before final scaling. Value is then converted from scale type to type of matrix
   * D before being stored in memory.
   *
   * int32_t, default: same as CUBLASLT_MATMUL_DESC_COMPUTE_TYPE
   */
  CUBLASLT_MATMUL_DESC_SCALE_TYPE = 1,

  /** Pointer mode of alpha and beta, see cublasLtPointerMode_t. When CUBLASLT_POINTER_MODE_DEVICE_VECTOR is in use,
   * alpha/beta vector lengths must match number of output matrix rows.
   *
   * int32_t, default: CUBLASLT_POINTER_MODE_HOST
   */
  CUBLASLT_MATMUL_DESC_POINTER_MODE = 2,

  /** Transform of matrix A, see cublasOperation_t.
   *
   * int32_t, default: CUBLAS_OP_N
   */
  CUBLASLT_MATMUL_DESC_TRANSA = 3,

  /** Transform of matrix B, see cublasOperation_t.
   *
   * int32_t, default: CUBLAS_OP_N
   */
  CUBLASLT_MATMUL_DESC_TRANSB = 4,

  /** Transform of matrix C, see cublasOperation_t.
   *
   * Currently only CUBLAS_OP_N is supported.
   *
   * int32_t, default: CUBLAS_OP_N
   */
  CUBLASLT_MATMUL_DESC_TRANSC = 5,

  /** Matrix fill mode, see cublasFillMode_t.
   *
   * int32_t, default: CUBLAS_FILL_MODE_FULL
   */
  CUBLASLT_MATMUL_DESC_FILL_MODE = 6,

  /** Epilogue function, see cublasLtEpilogue_t.
   *
   * uint32_t, default: CUBLASLT_EPILOGUE_DEFAULT
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE = 7,

  /** Bias or bias gradient vector pointer in the device memory.
   *
   * Bias case. See CUBLASLT_EPILOGUE_BIAS.
   * For bias data type see CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE.
   *
   * Bias vector length must match matrix D rows count.
   *
   * Bias gradient case. See CUBLASLT_EPILOGUE_DRELU_BGRAD and CUBLASLT_EPILOGUE_DGELU_BGRAD.
   * Bias gradient vector elements are the same type as the output elements
   * (Ctype) with the exception of IMMA kernels (see above).
   *
   * Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic()
   * depend on its value to determine expected pointer alignment.
   *
   * Bias case: const void *, default: NULL
   * Bias gradient case: void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_BIAS_POINTER = 8,

  /** Batch stride for bias or bias gradient vector.
   *
   * Used together with CUBLASLT_MATMUL_DESC_BIAS_POINTER when matrix D's CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1.
   *
   * int64_t, default: 0
   */
  CUBLASLT_MATMUL_DESC_BIAS_BATCH_STRIDE = 10,

  /** Pointer for epilogue auxiliary buffer.
   *
   * - Output vector for ReLu bit-mask in forward pass when CUBLASLT_EPILOGUE_RELU_AUX
   *   or CUBLASLT_EPILOGUE_RELU_AUX_BIAS epilogue is used.
   * - Input vector for ReLu bit-mask in backward pass when
   *   CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is used.
   *
   * - Output of GELU input matrix in forward pass when
   *   CUBLASLT_EPILOGUE_GELU_AUX_BIAS epilogue is used.
   * - Input of GELU input matrix for backward pass when
   *   CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue is used.
   *
   * For aux data type see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE.
   *
   * Routines that don't dereference this pointer, like cublasLtMatmulAlgoGetHeuristic()
   * depend on its value to determine expected pointer alignment.
   *
   * Requires setting CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD attribute.
   *
   * Forward pass: void *, default: NULL
   * Backward pass: const void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER = 11,

  /** Leading dimension for epilogue auxiliary buffer.
   *
   * - ReLu bit-mask matrix leading dimension in elements (i.e. bits)
   *   when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is
   *   used. Must be divisible by 128 and be no less than the number of rows in the output matrix.
   *
   * - GELU input matrix leading dimension in elements
   *   when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue used.
   *   Must be divisible by 8 and be no less than the number of rows in the output matrix.
   *
   * int64_t, default: 0
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD = 12,

  /** Batch stride for epilogue auxiliary buffer.
   *
   * - ReLu bit-mask matrix batch stride in elements (i.e. bits)
   *   when CUBLASLT_EPILOGUE_RELU_AUX, CUBLASLT_EPILOGUE_RELU_AUX_BIAS or CUBLASLT_EPILOGUE_DRELU_BGRAD epilogue is
   *   used. Must be divisible by 128.
   *
   * - GELU input matrix batch stride in elements
   *   when CUBLASLT_EPILOGUE_GELU_AUX_BIAS or CUBLASLT_EPILOGUE_DGELU_BGRAD epilogue used.
   *   Must be divisible by 8.
   *
   * int64_t, default: 0
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_BATCH_STRIDE = 13,

  /** Batch stride for alpha vector.
   *
   * Used together with CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_HOST when matrix D's
   * CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT > 1. If CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO is set then
   * CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE must be set to 0 as this mode doesn't support batched alpha vector.
   *
   * int64_t, default: 0
   */
  CUBLASLT_MATMUL_DESC_ALPHA_VECTOR_BATCH_STRIDE = 14,

  /** Number of SMs to target for parallel execution. Optimizes heuristics for execution on a different number of SMs
   * when user expects a concurrent stream to be using some of the device resources.
   *
   * int32_t, default: 0 - use the number reported by the device.
   */
  CUBLASLT_MATMUL_DESC_SM_COUNT_TARGET = 15,

  /** Device pointer to the scale factor value that converts data in matrix A to the compute data type range.
   *
   * The scaling factor value must have the same type as the compute type.
   *
   * If not specified, or set to NULL, the scaling factor is assumed to be 1.
   *
   * If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
   * will return CUBLAS_STATUS_INVALID_VALUE.
   *
   * const void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_A_SCALE_POINTER = 17,

  /** Device pointer to the scale factor value to convert data in matrix B to compute data type range.
   *
   * The scaling factor value must have the same type as the compute type.
   *
   * If not specified, or set to NULL, the scaling factor is assumed to be 1.
   *
   * If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
   * will return CUBLAS_STATUS_INVALID_VALUE.
   *
   * const void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_B_SCALE_POINTER = 18,

  /** Device pointer to the scale factor value to convert data in matrix C to compute data type range.
   *
   * The scaling factor value must have the same type as the compute type.
   *
   * If not specified, or set to NULL, the scaling factor is assumed to be 1.
   *
   * If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
   * will return CUBLAS_STATUS_INVALID_VALUE.
   *
   * const void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_C_SCALE_POINTER = 19,

  /** Device pointer to the scale factor value to convert data in matrix D to compute data type range.
   *
   * The scaling factor value must have the same type as the compute type.
   *
   * If not specified, or set to NULL, the scaling factor is assumed to be 1.
   *
   * If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
   * will return CUBLAS_STATUS_INVALID_VALUE.
   *
   * const void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_D_SCALE_POINTER = 20,

  /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the
   * output matrix.
   *
   * The computed value has the same type as the compute type.
   *
   * If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix
   * data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_STATUS_INVALID_VALUE.
   *
   * void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_AMAX_D_POINTER = 21,

  /** Type of the data to be stored to the memory pointed to by CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   *
   * If unset, the data type defaults to the type of elements of the output matrix with some exceptions, see details
   * below.
   *
   * ReLu uses a bit-mask.
   *
   * GELU input matrix elements type is the same as the type of elements of
   * the output matrix with some exceptions, see details below.
   *
   * For fp8 kernels with output type CUDA_R_8F_E4M3 the aux data type can be CUDA_R_8F_E4M3 or CUDA_R_16F with some
   * restrictions. See https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmulDescAttributes_t for more details.
   *
   * If set for an unsupported matrix data, scale, and compute type combination, calling cublasLtMatmul()
   * will return CUBLAS_STATUS_INVALID_VALUE.
   *
   * int32_t based on cudaDataType, default: -1
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_DATA_TYPE = 22,

  /** Device pointer to the scaling factor value to convert results from compute type data range to storage
   * data range in the auxiliary matrix that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   *
   * The scaling factor value must have the same type as the compute type.
   *
   * If not specified, or set to NULL, the scaling factor is assumed to be 1. If set for an unsupported matrix data,
   * scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_STATUS_INVALID_VALUE.
   *
   * void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_SCALE_POINTER = 23,

  /** Device pointer to the memory location that on completion will be set to the maximum of absolute values in the
   * buffer that is set via CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   *
   * The computed value has the same type as the compute type.
   *
   * If not specified or set to NULL, the maximum absolute value is not computed. If set for an unsupported matrix
   * data, scale, and compute type combination, calling cublasLtMatmul() will return CUBLAS_STATUS_INVALID_VALUE.
   *
   * void *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_AMAX_POINTER = 24,

  /** Flag for managing fp8 fast accumulation mode.
   * When enabled, problem execution might be faster but at the cost of lower accuracy because intermediate results
   * will not periodically be promoted to a higher precision.
   *
   * int8_t, default: 0 - fast accumulation mode is disabled.
   */
  CUBLASLT_MATMUL_DESC_FAST_ACCUM = 25,

  /** Type of bias or bias gradient vector in the device memory.
   *
   * Bias case: see CUBLASLT_EPILOGUE_BIAS.
   *
   * Bias vector elements are the same type as the elements of output matrix (Dtype) with the following exceptions:
   * - IMMA kernels with computeType=CUDA_R_32I and Ctype=CUDA_R_8I where the bias vector elements
   *   are the same type as alpha, beta (CUBLASLT_MATMUL_DESC_SCALE_TYPE=CUDA_R_32F)
   * - fp8 kernels with an output type of CUDA_R_32F, CUDA_R_8F_E4M3 or CUDA_R_8F_E5M2, See
   *   https://docs.nvidia.com/cuda/cublas/index.html#cublasLtMatmul for details.
   *
   * int32_t based on cudaDataType, default: -1
   */
  CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE = 26,

  /** EXPERIMENTAL: Number of atomic synchronization chunks in the row dimension of the output matrix D.
   *
   * int32_t, default 0 (atomic synchronization disabled)
   */
  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_ROWS = 27,

  /** EXPERIMENTAL: Number of atomic synchronization chunks in the column dimension of the output matrix D.
   *
   * int32_t, default 0 (atomic synchronization disabled)
   */
  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_NUM_CHUNKS_D_COLS = 28,

  /** EXPERIMENTAL: Pointer to a device array of input atomic counters consumed by a matmul.
   *
   * int32_t *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_IN_COUNTERS_POINTER = 29,

  /** EXPERIMENTAL: Pointer to a device array of output atomic counters produced by a matmul.
   *
   * int32_t *, default: NULL
   */
  CUBLASLT_MATMUL_DESC_ATOMIC_SYNC_OUT_COUNTERS_POINTER = 30,
} cublasLtMatmulDescAttributes_t;
922
+
923
/** Internal. Do not use directly.
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulDescInit_internal(  //
    cublasLtMatmulDesc_t matmulDesc,
    size_t size,
    cublasComputeType_t computeType,
    cudaDataType_t scaleType);
930
+
931
/** Initialize matmul operation descriptor in pre-allocated space.
 *
 * \retval CUBLAS_STATUS_ALLOC_FAILED if size of the pre-allocated space is insufficient
 * \retval CUBLAS_STATUS_SUCCESS if descriptor was initialized successfully
 */
static inline cublasStatus_t cublasLtMatmulDescInit(  //
    cublasLtMatmulDesc_t matmulDesc,
    cublasComputeType_t computeType,
    cudaDataType_t scaleType) {
  return cublasLtMatmulDescInit_internal(matmulDesc, sizeof(*matmulDesc), computeType, scaleType);
}
942
+
943
/** Create new matmul operation descriptor.
 *
 * \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated
 * \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulDescCreate(cublasLtMatmulDesc_t* matmulDesc,
                                                     cublasComputeType_t computeType,
                                                     cudaDataType_t scaleType);

/** Destroy matmul operation descriptor.
 *
 * \retval CUBLAS_STATUS_SUCCESS if operation was successful
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulDescDestroy(cublasLtMatmulDesc_t matmulDesc);

/** Set matmul operation descriptor attribute.
 *
 * \param[in]  matmulDesc   The descriptor
 * \param[in]  attr         The attribute
 * \param[in]  buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulDescSetAttribute(  //
    cublasLtMatmulDesc_t matmulDesc,
    cublasLtMatmulDescAttributes_t attr,
    const void* buf,
    size_t sizeInBytes);

/** Get matmul operation descriptor attribute.
 *
 * \param[in]  matmulDesc   The descriptor
 * \param[in]  attr         The attribute
 * \param[out] buf          memory address containing the new value
 * \param[in]  sizeInBytes  size of buf buffer for verification (in bytes)
 * \param[out] sizeWritten  only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
 *                          bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
 *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
 */
cublasStatus_t CUBLASWINAPI cublasLtMatmulDescGetAttribute(  //
    cublasLtMatmulDesc_t matmulDesc,
    cublasLtMatmulDescAttributes_t attr,
    void* buf,
    size_t sizeInBytes,
    size_t* sizeWritten);
995
+
996
/* ---------------------------------------------------------------------------------------*/
/* Helper functions for cublasLtMatrixTransformDesc_t */
/* ---------------------------------------------------------------------------------------*/
999
+
1000
/** Matrix transform descriptor attributes to define details of the operation.
 */
typedef enum {
  /** Scale type, see cudaDataType. Inputs are converted to scale type for scaling and summation and results are then
   * converted to output type to store in memory.
   *
   * int32_t
   */
  CUBLASLT_MATRIX_TRANSFORM_DESC_SCALE_TYPE,

  /** Pointer mode of alpha and beta, see cublasLtPointerMode_t.
   *
   * int32_t, default: CUBLASLT_POINTER_MODE_HOST
   */
  CUBLASLT_MATRIX_TRANSFORM_DESC_POINTER_MODE,

  /** Transform of matrix A, see cublasOperation_t.
   *
   * int32_t, default: CUBLAS_OP_N
   */
  CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSA,

  /** Transform of matrix B, see cublasOperation_t.
   *
   * int32_t, default: CUBLAS_OP_N
   */
  CUBLASLT_MATRIX_TRANSFORM_DESC_TRANSB,
} cublasLtMatrixTransformDescAttributes_t;
1028
+
1029
/** Internal. Do not use directly.
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescInit_internal(cublasLtMatrixTransformDesc_t transformDesc,
                                                                     size_t size,
                                                                     cudaDataType scaleType);
1034
+
1035
/** Initialize matrix transform operation descriptor in pre-allocated space.
 *
 * \retval CUBLAS_STATUS_ALLOC_FAILED if size of the pre-allocated space is insufficient
 * \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully
 */
static inline cublasStatus_t cublasLtMatrixTransformDescInit(cublasLtMatrixTransformDesc_t transformDesc,
                                                             cudaDataType scaleType) {
  return cublasLtMatrixTransformDescInit_internal(transformDesc, sizeof(*transformDesc), scaleType);
}
1044
+
1045
/** Create new matrix transform operation descriptor.
 *
 * \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated
 * \retval CUBLAS_STATUS_SUCCESS if descriptor was created successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescCreate(cublasLtMatrixTransformDesc_t* transformDesc,
                                                              cudaDataType scaleType);

/** Destroy matrix transform operation descriptor.
 *
 * \retval CUBLAS_STATUS_SUCCESS if operation was successful
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescDestroy(cublasLtMatrixTransformDesc_t transformDesc);

/** Set matrix transform operation descriptor attribute.
 *
 * \param[in]  transformDesc  The descriptor
 * \param[in]  attr           The attribute
 * \param[in]  buf            memory address containing the new value
 * \param[in]  sizeInBytes    size of buf buffer for verification (in bytes)
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute was set successfully
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescSetAttribute(  //
    cublasLtMatrixTransformDesc_t transformDesc,
    cublasLtMatrixTransformDescAttributes_t attr,
    const void* buf,
    size_t sizeInBytes);

/** Get matrix transform operation descriptor attribute.
 *
 * \param[in]  transformDesc  The descriptor
 * \param[in]  attr           The attribute
 * \param[out] buf            memory address containing the new value
 * \param[in]  sizeInBytes    size of buf buffer for verification (in bytes)
 * \param[out] sizeWritten    only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number
 *                            of bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
 *
 * \retval     CUBLAS_STATUS_INVALID_VALUE  if sizeInBytes is 0 and sizeWritten is NULL, or if  sizeInBytes is non-zero
 *                                          and buf is NULL or sizeInBytes doesn't match size of internal storage for
 *                                          selected attribute
 * \retval     CUBLAS_STATUS_SUCCESS        if attribute's value was successfully written to user memory
 */
cublasStatus_t CUBLASWINAPI cublasLtMatrixTransformDescGetAttribute(  //
    cublasLtMatrixTransformDesc_t transformDesc,
    cublasLtMatrixTransformDescAttributes_t attr,
    void* buf,
    size_t sizeInBytes,
    size_t* sizeWritten);
1096
+
1097
/** Reduction scheme for portions of the dot-product calculated in parallel (a. k. a. "split - K").
 */
typedef enum {
  /** No reduction scheme, dot-product shall be performed in one sequence.
   */
  CUBLASLT_REDUCTION_SCHEME_NONE = 0,

  /** Reduction is performed "in place" - using the output buffer (and output data type) and counters (in workspace) to
   * guarantee the sequentiality.
   */
  CUBLASLT_REDUCTION_SCHEME_INPLACE = 1,

  /** Intermediate results are stored in compute type in the workspace and reduced in a separate step.
   */
  CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE = 2,

  /** Intermediate results are stored in output type in the workspace and reduced in a separate step.
   */
  CUBLASLT_REDUCTION_SCHEME_OUTPUT_TYPE = 4,

  /* Bit-mask covering all valid reduction-scheme bits above. */
  CUBLASLT_REDUCTION_SCHEME_MASK = 0x7,
} cublasLtReductionScheme_t;
1119
+
1120
/** Postprocessing options for the epilogue
 */
typedef enum {
  /** No special postprocessing, just scale and quantize results if necessary.
   */
  CUBLASLT_EPILOGUE_DEFAULT = 1,

  /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)).
   */
  CUBLASLT_EPILOGUE_RELU = 2,

  /** ReLu, apply ReLu point-wise transform to the results (x:=max(x, 0)).
   *
   * This epilogue mode produces an extra output, a ReLu bit-mask matrix,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_RELU_AUX = (CUBLASLT_EPILOGUE_RELU | 128),

  /** Bias, apply (broadcasted) Bias from bias vector. Bias vector length must match matrix D rows, it must be packed
   * (stride between vector elements is 1). Bias vector is broadcasted to all columns and added before applying final
   * postprocessing.
   */
  CUBLASLT_EPILOGUE_BIAS = 4,

  /** ReLu and Bias, apply Bias and then ReLu transform
   */
  CUBLASLT_EPILOGUE_RELU_BIAS = (CUBLASLT_EPILOGUE_RELU | CUBLASLT_EPILOGUE_BIAS),

  /** ReLu and Bias, apply Bias and then ReLu transform
   *
   * This epilogue mode produces an extra output, a ReLu bit-mask matrix,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_RELU_AUX_BIAS = (CUBLASLT_EPILOGUE_RELU_AUX | CUBLASLT_EPILOGUE_BIAS),

  /* ReLu gradient. Apply ReLu gradient to matmul output. Store ReLu gradient in the output matrix.
   *
   * This epilogue mode requires an extra input,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_DRELU = 8 | 128,

  /* ReLu and Bias gradients. Apply independently ReLu and Bias gradient to
   * matmul output. Store ReLu gradient in the output matrix, and Bias gradient
   * in the auxiliary output (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
   *
   * This epilogue mode requires an extra input,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_DRELU_BGRAD = CUBLASLT_EPILOGUE_DRELU | 16,

  /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)).
   */
  CUBLASLT_EPILOGUE_GELU = 32,

  /** GELU, apply GELU point-wise transform to the results (x:=GELU(x)).
   *
   * This epilogue mode outputs GELU input as a separate matrix (useful for training).
   * See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_GELU_AUX = (CUBLASLT_EPILOGUE_GELU | 128),

  /** GELU and Bias, apply Bias and then GELU transform
   */
  CUBLASLT_EPILOGUE_GELU_BIAS = (CUBLASLT_EPILOGUE_GELU | CUBLASLT_EPILOGUE_BIAS),

  /** GELU and Bias, apply Bias and then GELU transform
   *
   * This epilogue mode outputs GELU input as a separate matrix (useful for training).
   * See CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_GELU_AUX_BIAS = (CUBLASLT_EPILOGUE_GELU_AUX | CUBLASLT_EPILOGUE_BIAS),

  /* GELU gradient. Apply GELU gradient to matmul output. Store GELU gradient in the output matrix.
   *
   * This epilogue mode requires an extra input,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_DGELU = 64 | 128,

  /* GELU and Bias gradients. Apply independently GELU and Bias gradient to
   * matmul output. Store GELU gradient in the output matrix, and Bias gradient
   * in the auxiliary output (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
   *
   * This epilogue mode requires an extra input,
   * see CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER.
   */
  CUBLASLT_EPILOGUE_DGELU_BGRAD = CUBLASLT_EPILOGUE_DGELU | 16,

  /** Bias gradient based on the input matrix A.
   *
   * The bias size corresponds to the number of rows of the matrix D.
   * The reduction happens over the GEMM's "k" dimension.
   *
   * Stores Bias gradient in the auxiliary output
   * (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
   */
  CUBLASLT_EPILOGUE_BGRADA = 256,

  /** Bias gradient based on the input matrix B.
   *
   * The bias size corresponds to the number of columns of the matrix D.
   * The reduction happens over the GEMM's "k" dimension.
   *
   * Stores Bias gradient in the auxiliary output
   * (see CUBLASLT_MATMUL_DESC_BIAS_POINTER).
   */
  CUBLASLT_EPILOGUE_BGRADB = 512,
} cublasLtEpilogue_t;
1229
+
1230
+ /** Matmul heuristic search mode
1231
+ */
1232
+ typedef enum {
1233
+ /** ask heuristics for best algo for given usecase
1234
+ */
1235
+ CUBLASLT_SEARCH_BEST_FIT = 0,
1236
+ /** only try to find best config for preconfigured algo id
1237
+ */
1238
+ CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID = 1,
1239
+ /** reserved for future use
1240
+ */
1241
+ CUBLASLT_SEARCH_RESERVED_02 = 2,
1242
+ /** reserved for future use
1243
+ */
1244
+ CUBLASLT_SEARCH_RESERVED_03 = 3,
1245
+ /** reserved for future use
1246
+ */
1247
+ CUBLASLT_SEARCH_RESERVED_04 = 4,
1248
+ /** reserved for future use
1249
+ */
1250
+ CUBLASLT_SEARCH_RESERVED_05 = 5,
1251
+ } cublasLtMatmulSearch_t;
1252
+
1253
+ /** Algo search preference to fine tune the heuristic function. */
1254
+ typedef enum {
1255
+ /** Search mode, see cublasLtMatmulSearch_t.
1256
+ *
1257
+ * uint32_t, default: CUBLASLT_SEARCH_BEST_FIT
1258
+ */
1259
+ CUBLASLT_MATMUL_PREF_SEARCH_MODE = 0,
1260
+
1261
+ /** Maximum allowed workspace size in bytes.
1262
+ *
1263
+ * uint64_t, default: 0 - no workspace allowed
1264
+ */
1265
+ CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES = 1,
1266
+
1267
+ /** Reduction scheme mask, see cublasLtReductionScheme_t. Filters heuristic result to only include algo configs that
1268
+ * use one of the required modes.
1269
+ *
1270
+ * E.g. mask value of 0x03 will allow only INPLACE and COMPUTE_TYPE reduction schemes.
1271
+ *
1272
+ * uint32_t, default: CUBLASLT_REDUCTION_SCHEME_MASK (allows all reduction schemes)
1273
+ */
1274
+ CUBLASLT_MATMUL_PREF_REDUCTION_SCHEME_MASK = 3,
1275
+
1276
+ /** Minimum buffer alignment for matrix A (in bytes).
1277
+ *
1278
+ * Selecting a smaller value will exclude algorithms that can not work with matrix A that is not as strictly aligned
1279
+ * as they need.
1280
+ *
1281
+ * uint32_t, default: 256
1282
+ */
1283
+ CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES = 5,
1284
+
1285
+ /** Minimum buffer alignment for matrix B (in bytes).
1286
+ *
1287
+ * Selecting a smaller value will exclude algorithms that can not work with matrix B that is not as strictly aligned
1288
+ * as they need.
1289
+ *
1290
+ * uint32_t, default: 256
1291
+ */
1292
+ CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES = 6,
1293
+
1294
+ /** Minimum buffer alignment for matrix C (in bytes).
1295
+ *
1296
+ * Selecting a smaller value will exclude algorithms that can not work with matrix C that is not as strictly aligned
1297
+ * as they need.
1298
+ *
1299
+ * uint32_t, default: 256
1300
+ */
1301
+ CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES = 7,
1302
+
1303
+ /** Minimum buffer alignment for matrix D (in bytes).
1304
+ *
1305
+ * Selecting a smaller value will exclude algorithms that can not work with matrix D that is not as strictly aligned
1306
+ * as they need.
1307
+ *
1308
+ * uint32_t, default: 256
1309
+ */
1310
+ CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_D_BYTES = 8,
1311
+
1312
+ /** Maximum wave count.
1313
+ *
1314
+ * See cublasLtMatmulHeuristicResult_t::wavesCount.
1315
+ *
1316
+ * Selecting a non-zero value will exclude algorithms that report device utilization higher than specified.
1317
+ *
1318
+ * float, default: 0.0f
1319
+ */
1320
+ CUBLASLT_MATMUL_PREF_MAX_WAVES_COUNT = 9,
1321
+
1322
+ /** Numerical implementation details mask, see cublasLtNumericalImplFlags_t. Filters heuristic result to only include
1323
+ * algorithms that use the allowed implementations.
1324
+ *
1325
+ * uint64_t, default: uint64_t(-1) (allow everything)
1326
+ */
1327
+ CUBLASLT_MATMUL_PREF_IMPL_MASK = 12,
1328
+ } cublasLtMatmulPreferenceAttributes_t;
1329
+
1330
+ /** Internal. Do not use directly.
1331
+ */
1332
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceInit_internal(cublasLtMatmulPreference_t pref, size_t size);
1333
+
1334
+ /** Initialize matmul heuristic search preference descriptor in pre-allocated space.
1335
+ *
1336
+ * \retval CUBLAS_STATUS_ALLOC_FAILED if size of the pre-allocated space is insufficient
1337
+ * \retval CUBLAS_STATUS_SUCCESS if desciptor was created successfully
1338
+ */
1339
+ static inline cublasStatus_t cublasLtMatmulPreferenceInit(cublasLtMatmulPreference_t pref) {
1340
+ return cublasLtMatmulPreferenceInit_internal(pref, sizeof(*pref));
1341
+ }
1342
+
1343
+ /** Create new matmul heuristic search preference descriptor.
1344
+ *
1345
+ * \retval CUBLAS_STATUS_ALLOC_FAILED if memory could not be allocated
1346
+ * \retval CUBLAS_STATUS_SUCCESS if desciptor was created successfully
1347
+ */
1348
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceCreate(cublasLtMatmulPreference_t* pref);
1349
+
1350
+ /** Destroy matmul heuristic search preference descriptor.
1351
+ *
1352
+ * \retval CUBLAS_STATUS_SUCCESS if operation was successful
1353
+ */
1354
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceDestroy(cublasLtMatmulPreference_t pref);
1355
+
1356
+ /** Set matmul heuristic search preference descriptor attribute.
1357
+ *
1358
+ * \param[in] pref The descriptor
1359
+ * \param[in] attr The attribute
1360
+ * \param[in] buf memory address containing the new value
1361
+ * \param[in] sizeInBytes size of buf buffer for verification (in bytes)
1362
+ *
1363
+ * \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for
1364
+ * selected attribute
1365
+ * \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully
1366
+ */
1367
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceSetAttribute( //
1368
+ cublasLtMatmulPreference_t pref,
1369
+ cublasLtMatmulPreferenceAttributes_t attr,
1370
+ const void* buf,
1371
+ size_t sizeInBytes);
1372
+
1373
+ /** Get matmul heuristic search preference descriptor attribute.
1374
+ *
1375
+ * \param[in] pref The descriptor
1376
+ * \param[in] attr The attribute
1377
+ * \param[out] buf memory address containing the new value
1378
+ * \param[in] sizeInBytes size of buf buffer for verification (in bytes)
1379
+ * \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
1380
+ * bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
1381
+ *
1382
+ * \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero
1383
+ * and buf is NULL or sizeInBytes doesn't match size of internal storage for
1384
+ * selected attribute
1385
+ * \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory
1386
+ */
1387
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulPreferenceGetAttribute( //
1388
+ cublasLtMatmulPreference_t pref,
1389
+ cublasLtMatmulPreferenceAttributes_t attr,
1390
+ void* buf,
1391
+ size_t sizeInBytes,
1392
+ size_t* sizeWritten);
1393
+
1394
+ /** Results structure used by cublasLtMatmulGetAlgo.
1395
+ *
1396
+ * Holds returned configured algo descriptor and its runtime properties.
1397
+ */
1398
+ typedef struct {
1399
+ /** Matmul algorithm descriptor.
1400
+ *
1401
+ * Must be initialized with cublasLtMatmulAlgoInit() if preferences' CUBLASLT_MATMUL_PERF_SEARCH_MODE is set to
1402
+ * CUBLASLT_SEARCH_LIMITED_BY_ALGO_ID
1403
+ */
1404
+ cublasLtMatmulAlgo_t algo;
1405
+
1406
+ /** Actual size of workspace memory required.
1407
+ */
1408
+ size_t workspaceSize;
1409
+
1410
+ /** Result status, other fields are only valid if after call to cublasLtMatmulAlgoGetHeuristic() this member is set to
1411
+ * CUBLAS_STATUS_SUCCESS.
1412
+ */
1413
+ cublasStatus_t state;
1414
+
1415
+ /** Waves count - a device utilization metric.
1416
+ *
1417
+ * wavesCount value of 1.0f suggests that when kernel is launched it will fully occupy the GPU.
1418
+ */
1419
+ float wavesCount;
1420
+
1421
+ int reserved[4];
1422
+ } cublasLtMatmulHeuristicResult_t;
1423
+
1424
+ /** Query cublasLt heuristic for algorithm appropriate for given use case.
1425
+ *
1426
+ * \param[in] lightHandle Pointer to the allocated cuBLASLt handle for the cuBLASLt
1427
+ * context. See cublasLtHandle_t.
1428
+ * \param[in] operationDesc Handle to the matrix multiplication descriptor.
1429
+ * \param[in] Adesc Handle to the layout descriptors for matrix A.
1430
+ * \param[in] Bdesc Handle to the layout descriptors for matrix B.
1431
+ * \param[in] Cdesc Handle to the layout descriptors for matrix C.
1432
+ * \param[in] Ddesc Handle to the layout descriptors for matrix D.
1433
+ * \param[in] preference Pointer to the structure holding the heuristic search
1434
+ * preferences descriptor. See cublasLtMatrixLayout_t.
1435
+ * \param[in] requestedAlgoCount Size of heuristicResultsArray (in elements) and requested
1436
+ * maximum number of algorithms to return.
1437
+ * \param[in, out] heuristicResultsArray Output algorithms and associated runtime characteristics,
1438
+ * ordered in increasing estimated compute time.
1439
+ * \param[out] returnAlgoCount The number of heuristicResultsArray elements written.
1440
+ *
1441
+ * \retval CUBLAS_STATUS_INVALID_VALUE if requestedAlgoCount is less or equal to zero
1442
+ * \retval CUBLAS_STATUS_NOT_SUPPORTED if no heuristic function available for current configuration
1443
+ * \retval CUBLAS_STATUS_SUCCESS if query was successful, inspect
1444
+ * heuristicResultsArray[0 to (returnAlgoCount - 1)].state
1445
+ * for detail status of results
1446
+ */
1447
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetHeuristic(cublasLtHandle_t lightHandle,
1448
+ cublasLtMatmulDesc_t operationDesc,
1449
+ cublasLtMatrixLayout_t Adesc,
1450
+ cublasLtMatrixLayout_t Bdesc,
1451
+ cublasLtMatrixLayout_t Cdesc,
1452
+ cublasLtMatrixLayout_t Ddesc,
1453
+ cublasLtMatmulPreference_t preference,
1454
+ int requestedAlgoCount,
1455
+ cublasLtMatmulHeuristicResult_t heuristicResultsArray[],
1456
+ int* returnAlgoCount);
1457
+
1458
+ /* ---------------------------------------------------------------------------------------*/
1459
+ /* Lower level API to be able to implement own Heuristic and Find routines */
1460
+ /* ---------------------------------------------------------------------------------------*/
1461
+
1462
+ /** Routine to get all algo IDs that can potentially run
1463
+ *
1464
+ * \param[in] int requestedAlgoCount requested number of algos (must be less or equal to size of algoIdsA
1465
+ * (in elements)) \param[out] algoIdsA array to write algoIds to \param[out] returnAlgoCount number of algoIds
1466
+ * actually written
1467
+ *
1468
+ * \retval CUBLAS_STATUS_INVALID_VALUE if requestedAlgoCount is less or equal to zero
1469
+ * \retval CUBLAS_STATUS_SUCCESS if query was successful, inspect returnAlgoCount to get actual number of IDs
1470
+ * available
1471
+ */
1472
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoGetIds(cublasLtHandle_t lightHandle,
1473
+ cublasComputeType_t computeType,
1474
+ cudaDataType_t scaleType,
1475
+ cudaDataType_t Atype,
1476
+ cudaDataType_t Btype,
1477
+ cudaDataType_t Ctype,
1478
+ cudaDataType_t Dtype,
1479
+ int requestedAlgoCount,
1480
+ int algoIdsArray[],
1481
+ int* returnAlgoCount);
1482
+
1483
+ /** Initialize algo structure
1484
+ *
1485
+ * \retval CUBLAS_STATUS_INVALID_VALUE if algo is NULL or algoId is outside of recognized range
1486
+ * \retval CUBLAS_STATUS_NOT_SUPPORTED if algoId is not supported for given combination of data types
1487
+ * \retval CUBLAS_STATUS_SUCCESS if the structure was successfully initialized
1488
+ */
1489
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoInit(cublasLtHandle_t lightHandle,
1490
+ cublasComputeType_t computeType,
1491
+ cudaDataType_t scaleType,
1492
+ cudaDataType_t Atype,
1493
+ cudaDataType_t Btype,
1494
+ cudaDataType_t Ctype,
1495
+ cudaDataType_t Dtype,
1496
+ int algoId,
1497
+ cublasLtMatmulAlgo_t* algo);
1498
+
1499
+ /** Check configured algo descriptor for correctness and support on current device.
1500
+ *
1501
+ * Result includes required workspace size and calculated wave count.
1502
+ *
1503
+ * CUBLAS_STATUS_SUCCESS doesn't fully guarantee algo will run (will fail if e.g. buffers are not correctly aligned);
1504
+ * but if cublasLtMatmulAlgoCheck fails, the algo will not run.
1505
+ *
1506
+ * \param[in] algo algo configuration to check
1507
+ * \param[out] result result structure to report algo runtime characteristics; algo field is never updated
1508
+ *
1509
+ * \retval CUBLAS_STATUS_INVALID_VALUE if matrix layout descriptors or operation descriptor don't match algo
1510
+ * descriptor
1511
+ * \retval CUBLAS_STATUS_NOT_SUPPORTED if algo configuration or data type combination is not currently supported on
1512
+ * given device
1513
+ * \retval CUBLAS_STATUS_ARCH_MISMATCH if algo configuration cannot be run using the selected device
1514
+ * \retval CUBLAS_STATUS_SUCCESS if check was successful
1515
+ */
1516
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCheck( //
1517
+ cublasLtHandle_t lightHandle,
1518
+ cublasLtMatmulDesc_t operationDesc,
1519
+ cublasLtMatrixLayout_t Adesc,
1520
+ cublasLtMatrixLayout_t Bdesc,
1521
+ cublasLtMatrixLayout_t Cdesc,
1522
+ cublasLtMatrixLayout_t Ddesc,
1523
+ const cublasLtMatmulAlgo_t* algo, ///< may point to result->algo
1524
+ cublasLtMatmulHeuristicResult_t* result);
1525
+
1526
+ /** Capabilities Attributes that can be retrieved from an initialized Algo structure
1527
+ */
1528
+ typedef enum {
1529
+ /** support for split K, see CUBLASLT_ALGO_CONFIG_SPLITK_NUM
1530
+ *
1531
+ * int32_t, 0 means no support, supported otherwise
1532
+ */
1533
+ CUBLASLT_ALGO_CAP_SPLITK_SUPPORT = 0,
1534
+
1535
+ /** reduction scheme mask, see cublasLtReductionScheme_t; shows supported reduction schemes, if reduction scheme is
1536
+ * not masked out it is supported.
1537
+ *
1538
+ * e.g. int isReductionSchemeComputeTypeSupported ? (reductionSchemeMask & CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE) ==
1539
+ * CUBLASLT_REDUCTION_SCHEME_COMPUTE_TYPE ? 1 : 0;
1540
+ *
1541
+ * uint32_t
1542
+ */
1543
+ CUBLASLT_ALGO_CAP_REDUCTION_SCHEME_MASK = 1,
1544
+
1545
+ /** support for cta swizzling, see CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING
1546
+ *
1547
+ * uint32_t, 0 means no support, 1 means supported value of 1, other values are reserved
1548
+ */
1549
+ CUBLASLT_ALGO_CAP_CTA_SWIZZLING_SUPPORT = 2,
1550
+
1551
+ /** support strided batch
1552
+ *
1553
+ * int32_t, 0 means no support, supported otherwise
1554
+ */
1555
+ CUBLASLT_ALGO_CAP_STRIDED_BATCH_SUPPORT = 3,
1556
+
1557
+ /** support results out of place (D != C in D = alpha.A.B + beta.C)
1558
+ *
1559
+ * int32_t, 0 means no support, supported otherwise
1560
+ */
1561
+ CUBLASLT_ALGO_CAP_OUT_OF_PLACE_RESULT_SUPPORT = 4,
1562
+
1563
+ /** syrk/herk support (on top of regular gemm)
1564
+ *
1565
+ * int32_t, 0 means no support, supported otherwise
1566
+ */
1567
+ CUBLASLT_ALGO_CAP_UPLO_SUPPORT = 5,
1568
+
1569
+ /** tile ids possible to use, see cublasLtMatmulTile_t; if no tile ids are supported use
1570
+ * CUBLASLT_MATMUL_TILE_UNDEFINED
1571
+ *
1572
+ * use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count
1573
+ *
1574
+ * array of uint32_t
1575
+ */
1576
+ CUBLASLT_ALGO_CAP_TILE_IDS = 6,
1577
+
1578
+ /** custom option range is from 0 to CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX (inclusive), see
1579
+ * CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION
1580
+ *
1581
+ * int32_t
1582
+ */
1583
+ CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX = 7,
1584
+
1585
+ /** whether algorithm supports custom (not COL or ROW memory order), see cublasLtOrder_t
1586
+ *
1587
+ * int32_t 0 means only COL and ROW memory order is allowed, non-zero means that algo might have different
1588
+ * requirements;
1589
+ */
1590
+ CUBLASLT_ALGO_CAP_CUSTOM_MEMORY_ORDER = 10,
1591
+
1592
+ /** bitmask enumerating pointer modes algorithm supports
1593
+ *
1594
+ * uint32_t, see cublasLtPointerModeMask_t
1595
+ */
1596
+ CUBLASLT_ALGO_CAP_POINTER_MODE_MASK = 11,
1597
+
1598
+ /** bitmask enumerating kinds of postprocessing algorithm supports in the epilogue
1599
+ *
1600
+ * uint32_t, see cublasLtEpilogue_t
1601
+ */
1602
+ CUBLASLT_ALGO_CAP_EPILOGUE_MASK = 12,
1603
+
1604
+ /** stages ids possible to use, see cublasLtMatmulStages_t; if no stages ids are supported use
1605
+ * CUBLASLT_MATMUL_STAGES_UNDEFINED
1606
+ *
1607
+ * use cublasLtMatmulAlgoCapGetAttribute() with sizeInBytes=0 to query actual count
1608
+ *
1609
+ * array of uint32_t
1610
+ */
1611
+ CUBLASLT_ALGO_CAP_STAGES_IDS = 13,
1612
+
1613
+ /** support for nagative ld for all of the matrices
1614
+ *
1615
+ * int32_t 0 means no support, supported otherwise
1616
+ */
1617
+ CUBLASLT_ALGO_CAP_LD_NEGATIVE = 14,
1618
+
1619
+ /** details about algorithm's implementation that affect it's numerical behavior
1620
+ *
1621
+ * uint64_t, see cublasLtNumericalImplFlags_t
1622
+ */
1623
+ CUBLASLT_ALGO_CAP_NUMERICAL_IMPL_FLAGS = 15,
1624
+
1625
+ /** minimum alignment required for A matrix in bytes
1626
+ * (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
1627
+ *
1628
+ * uint32_t
1629
+ */
1630
+ CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_A_BYTES = 16,
1631
+
1632
+ /** minimum alignment required for B matrix in bytes
1633
+ * (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
1634
+ *
1635
+ * uint32_t
1636
+ */
1637
+ CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_B_BYTES = 17,
1638
+
1639
+ /** minimum alignment required for C matrix in bytes
1640
+ * (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
1641
+ *
1642
+ * uint32_t
1643
+ */
1644
+ CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_C_BYTES = 18,
1645
+
1646
+ /** minimum alignment required for D matrix in bytes
1647
+ * (required for buffer pointer, leading dimension, and possibly other strides defined for matrix memory order)
1648
+ *
1649
+ * uint32_t
1650
+ */
1651
+ CUBLASLT_ALGO_CAP_MIN_ALIGNMENT_D_BYTES = 19,
1652
+
1653
+ /** EXPERIMENTAL: support for synchronization via atomic counters
1654
+ *
1655
+ * int32_t
1656
+ */
1657
+ CUBLASLT_ALGO_CAP_ATOMIC_SYNC = 20,
1658
+ } cublasLtMatmulAlgoCapAttributes_t;
1659
+
1660
+ /** Get algo capability attribute.
1661
+ *
1662
+ * E.g. to get list of supported Tile IDs:
1663
+ * cublasLtMatmulTile_t tiles[CUBLASLT_MATMUL_TILE_END];
1664
+ * size_t num_tiles, size_written;
1665
+ * if (cublasLtMatmulAlgoCapGetAttribute(algo, CUBLASLT_ALGO_CAP_TILE_IDS, tiles, sizeof(tiles), size_written) ==
1666
+ * CUBLAS_STATUS_SUCCESS) { num_tiles = size_written / sizeof(tiles[0]);
1667
+ * }
1668
+ *
1669
+ * \param[in] algo The algo descriptor
1670
+ * \param[in] attr The attribute
1671
+ * \param[out] buf memory address containing the new value
1672
+ * \param[in] sizeInBytes size of buf buffer for verification (in bytes)
1673
+ * \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
1674
+ * bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
1675
+ *
1676
+ * \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero
1677
+ * and buf is NULL or sizeInBytes doesn't match size of internal storage for
1678
+ * selected attribute
1679
+ * \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory
1680
+ */
1681
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoCapGetAttribute(const cublasLtMatmulAlgo_t* algo,
1682
+ cublasLtMatmulAlgoCapAttributes_t attr,
1683
+ void* buf,
1684
+ size_t sizeInBytes,
1685
+ size_t* sizeWritten);
1686
+
1687
+ /** Algo Configuration Attributes that can be set according to the Algo capabilities
1688
+ */
1689
+ typedef enum {
1690
+ /** algorithm index, see cublasLtMatmulAlgoGetIds()
1691
+ *
1692
+ * readonly, set by cublasLtMatmulAlgoInit()
1693
+ * int32_t
1694
+ */
1695
+ CUBLASLT_ALGO_CONFIG_ID = 0,
1696
+ /** tile id, see cublasLtMatmulTile_t
1697
+ *
1698
+ * uint32_t, default: CUBLASLT_MATMUL_TILE_UNDEFINED
1699
+ */
1700
+ CUBLASLT_ALGO_CONFIG_TILE_ID = 1,
1701
+ /** Number of K splits. If the number of K splits is greater than one, SPLITK_NUM parts
1702
+ * of matrix multiplication will be computed in parallel. The results will be accumulated
1703
+ * according to CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME
1704
+ *
1705
+ * int32_t, default: 1
1706
+ */
1707
+ CUBLASLT_ALGO_CONFIG_SPLITK_NUM = 2,
1708
+ /** reduction scheme, see cublasLtReductionScheme_t
1709
+ *
1710
+ * uint32_t, default: CUBLASLT_REDUCTION_SCHEME_NONE
1711
+ */
1712
+ CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME = 3,
1713
+ /** cta swizzling, change mapping from CUDA grid coordinates to parts of the matrices
1714
+ *
1715
+ * possible values: 0, 1, other values reserved
1716
+ *
1717
+ * uint32_t, default: 0
1718
+ */
1719
+ CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING = 4,
1720
+ /** custom option, each algorithm can support some custom options that don't fit description of the other config
1721
+ * attributes, see CUBLASLT_ALGO_CAP_CUSTOM_OPTION_MAX to get accepted range for any specific case
1722
+ *
1723
+ * uint32_t, default: 0
1724
+ */
1725
+ CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION = 5,
1726
+ /** stages id, see cublasLtMatmulStages_t
1727
+ *
1728
+ * uint32_t, default: CUBLASLT_MATMUL_STAGES_UNDEFINED
1729
+ */
1730
+ CUBLASLT_ALGO_CONFIG_STAGES_ID = 6,
1731
+ /** inner shape id, see cublasLtMatmulInnerShape_t
1732
+ *
1733
+ * uint16_t, default: 0 (CUBLASLT_MATMUL_INNER_SHAPE_UNDEFINED)
1734
+ */
1735
+ CUBLASLT_ALGO_CONFIG_INNER_SHAPE_ID = 7,
1736
+ /** Thread Block Cluster shape id, see cublasLtClusterShape_t. Defines cluster size to use.
1737
+ *
1738
+ * uint16_t, default: 0 (CUBLASLT_CLUSTER_SHAPE_AUTO)
1739
+ */
1740
+ CUBLASLT_ALGO_CONFIG_CLUSTER_SHAPE_ID = 8,
1741
+ } cublasLtMatmulAlgoConfigAttributes_t;
1742
+
1743
+ /** Set algo configuration attribute.
1744
+ *
1745
+ * \param[in] algo The algo descriptor
1746
+ * \param[in] attr The attribute
1747
+ * \param[in] buf memory address containing the new value
1748
+ * \param[in] sizeInBytes size of buf buffer for verification (in bytes)
1749
+ *
1750
+ * \retval CUBLAS_STATUS_INVALID_VALUE if buf is NULL or sizeInBytes doesn't match size of internal storage for
1751
+ * selected attribute
1752
+ * \retval CUBLAS_STATUS_SUCCESS if attribute was set successfully
1753
+ */
1754
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigSetAttribute(cublasLtMatmulAlgo_t* algo,
1755
+ cublasLtMatmulAlgoConfigAttributes_t attr,
1756
+ const void* buf,
1757
+ size_t sizeInBytes);
1758
+
1759
+ /** Get algo configuration attribute.
1760
+ *
1761
+ * \param[in] algo The algo descriptor
1762
+ * \param[in] attr The attribute
1763
+ * \param[out] buf memory address containing the new value
1764
+ * \param[in] sizeInBytes size of buf buffer for verification (in bytes)
1765
+ * \param[out] sizeWritten only valid when return value is CUBLAS_STATUS_SUCCESS. If sizeInBytes is non-zero: number of
1766
+ * bytes actually written, if sizeInBytes is 0: number of bytes needed to write full contents
1767
+ *
1768
+ * \retval CUBLAS_STATUS_INVALID_VALUE if sizeInBytes is 0 and sizeWritten is NULL, or if sizeInBytes is non-zero
1769
+ * and buf is NULL or sizeInBytes doesn't match size of internal storage for
1770
+ * selected attribute
1771
+ * \retval CUBLAS_STATUS_SUCCESS if attribute's value was successfully written to user memory
1772
+ */
1773
+ cublasStatus_t CUBLASWINAPI cublasLtMatmulAlgoConfigGetAttribute(const cublasLtMatmulAlgo_t* algo,
1774
+ cublasLtMatmulAlgoConfigAttributes_t attr,
1775
+ void* buf,
1776
+ size_t sizeInBytes,
1777
+ size_t* sizeWritten);
1778
+
1779
+ /** Experimental: Logger callback type.
1780
+ */
1781
+ typedef void (*cublasLtLoggerCallback_t)(int logLevel, const char* functionName, const char* message);
1782
+
1783
+ /** Experimental: Logger callback setter.
1784
+ *
1785
+ * \param[in] callback a user defined callback function to be called by the logger
1786
+ *
1787
+ * \retval CUBLAS_STATUS_SUCCESS if callback was set successfully
1788
+ */
1789
+ cublasStatus_t CUBLASWINAPI cublasLtLoggerSetCallback(cublasLtLoggerCallback_t callback);
1790
+
1791
+ /** Experimental: Log file setter.
1792
+ *
1793
+ * \param[in] file an open file with write permissions
1794
+ *
1795
+ * \retval CUBLAS_STATUS_SUCCESS if log file was set successfully
1796
+ */
1797
+ cublasStatus_t CUBLASWINAPI cublasLtLoggerSetFile(FILE* file);
1798
+
1799
+ /** Experimental: Open log file.
1800
+ *
1801
+ * \param[in] logFile log file path. if the log file does not exist, it will be created
1802
+ *
1803
+ * \retval CUBLAS_STATUS_SUCCESS if log file was created successfully
1804
+ */
1805
+ cublasStatus_t CUBLASWINAPI cublasLtLoggerOpenFile(const char* logFile);
1806
+
1807
+ /** Experimental: Log level setter.
1808
+ *
1809
+ * \param[in] level log level, should be one of the following:
1810
+ * 0. Off
1811
+ * 1. Errors
1812
+ * 2. Performance Trace
1813
+ * 3. Performance Hints
1814
+ * 4. Heuristics Trace
1815
+ * 5. API Trace
1816
+ *
1817
+ * \retval CUBLAS_STATUS_INVALID_VALUE if log level is not one of the above levels
1818
+ *
1819
+ * \retval CUBLAS_STATUS_SUCCESS if log level was set successfully
1820
+ */
1821
+ cublasStatus_t CUBLASWINAPI cublasLtLoggerSetLevel(int level);
1822
+
1823
+ /** Experimental: Log mask setter.
1824
+ *
1825
+ * \param[in] mask log mask, should be a combination of the following masks:
1826
+ * 0. Off
1827
+ * 1. Errors
1828
+ * 2. Performance Trace
1829
+ * 4. Performance Hints
1830
+ * 8. Heuristics Trace
1831
+ * 16. API Trace
1832
+ *
1833
+ * \retval CUBLAS_STATUS_SUCCESS if log mask was set successfully
1834
+ */
1835
+ cublasStatus_t CUBLASWINAPI cublasLtLoggerSetMask(int mask);
1836
+
1837
+ /** Experimental: Disable logging for the entire session.
1838
+ *
1839
+ * \retval CUBLAS_STATUS_SUCCESS if disabled logging
1840
+ */
1841
+ cublasStatus_t CUBLASWINAPI cublasLtLoggerForceDisable();
1842
+
1843
+ #if defined(__cplusplus)
1844
+ }
1845
+ #endif /* __cplusplus */
.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublasXt.h ADDED
@@ -0,0 +1,693 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cublasXt : Host API, Out of Core and Multi-GPU BLAS Library
51
+
52
+ */
53
+
54
+ #if !defined(CUBLAS_XT_H_)
55
+ #define CUBLAS_XT_H_
56
+
57
+ #include "driver_types.h"
58
+ #include "cuComplex.h" /* import complex data type */
59
+
60
+ #include "cublas_v2.h"
61
+
62
+ #if defined(__cplusplus)
63
+ extern "C" {
64
+ #endif /* __cplusplus */
65
+
66
+ struct cublasXtContext;
67
+ typedef struct cublasXtContext* cublasXtHandle_t;
68
+
69
+ cublasStatus_t CUBLASWINAPI cublasXtCreate(cublasXtHandle_t* handle);
70
+ cublasStatus_t CUBLASWINAPI cublasXtDestroy(cublasXtHandle_t handle);
71
+ cublasStatus_t CUBLASWINAPI cublasXtGetNumBoards(int nbDevices, int deviceId[], int* nbBoards);
72
+ cublasStatus_t CUBLASWINAPI cublasXtMaxBoards(int* nbGpuBoards);
73
+ /* This routine selects the Gpus that the user want to use for CUBLAS-XT */
74
+ cublasStatus_t CUBLASWINAPI cublasXtDeviceSelect(cublasXtHandle_t handle, int nbDevices, int deviceId[]);
75
+
76
+ /* This routine allows to change the dimension of the tiles ( blockDim x blockDim ) */
77
+ cublasStatus_t CUBLASWINAPI cublasXtSetBlockDim(cublasXtHandle_t handle, int blockDim);
78
+ cublasStatus_t CUBLASWINAPI cublasXtGetBlockDim(cublasXtHandle_t handle, int* blockDim);
79
+
80
+ typedef enum { CUBLASXT_PINNING_DISABLED = 0, CUBLASXT_PINNING_ENABLED = 1 } cublasXtPinnedMemMode_t;
81
+ /* This routine allows to CUBLAS-XT to pin the Host memory if it find out that some of the matrix passed
82
+ are not pinned : Pinning/Unpinning the Host memory is still a costly operation
83
+ It is better if the user controls the memory on its own (by pinning/unpinning oly when necessary)
84
+ */
85
+ cublasStatus_t CUBLASWINAPI cublasXtGetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t* mode);
86
+ cublasStatus_t CUBLASWINAPI cublasXtSetPinningMemMode(cublasXtHandle_t handle, cublasXtPinnedMemMode_t mode);
87
+
88
+ /* This routines is to provide a CPU Blas routines, used for too small sizes or hybrid computation */
89
+ typedef enum {
90
+ CUBLASXT_FLOAT = 0,
91
+ CUBLASXT_DOUBLE = 1,
92
+ CUBLASXT_COMPLEX = 2,
93
+ CUBLASXT_DOUBLECOMPLEX = 3,
94
+ } cublasXtOpType_t;
95
+
96
+ typedef enum {
97
+ CUBLASXT_GEMM = 0,
98
+ CUBLASXT_SYRK = 1,
99
+ CUBLASXT_HERK = 2,
100
+ CUBLASXT_SYMM = 3,
101
+ CUBLASXT_HEMM = 4,
102
+ CUBLASXT_TRSM = 5,
103
+ CUBLASXT_SYR2K = 6,
104
+ CUBLASXT_HER2K = 7,
105
+
106
+ CUBLASXT_SPMM = 8,
107
+ CUBLASXT_SYRKX = 9,
108
+ CUBLASXT_HERKX = 10,
109
+ CUBLASXT_TRMM = 11,
110
+ CUBLASXT_ROUTINE_MAX = 12,
111
+ } cublasXtBlasOp_t;
112
+
113
+ /* Currently only 32-bit integer BLAS routines are supported */
114
+ cublasStatus_t CUBLASWINAPI cublasXtSetCpuRoutine(cublasXtHandle_t handle,
115
+ cublasXtBlasOp_t blasOp,
116
+ cublasXtOpType_t type,
117
+ void* blasFunctor);
118
+
119
+ /* Specified the percentage of work that should done by the CPU, default is 0 (no work) */
120
+ cublasStatus_t CUBLASWINAPI cublasXtSetCpuRatio(cublasXtHandle_t handle,
121
+ cublasXtBlasOp_t blasOp,
122
+ cublasXtOpType_t type,
123
+ float ratio);
124
+
125
+ /* GEMM */
126
+ cublasStatus_t CUBLASWINAPI cublasXtSgemm(cublasXtHandle_t handle,
127
+ cublasOperation_t transa,
128
+ cublasOperation_t transb,
129
+ size_t m,
130
+ size_t n,
131
+ size_t k,
132
+ const float* alpha,
133
+ const float* A,
134
+ size_t lda,
135
+ const float* B,
136
+ size_t ldb,
137
+ const float* beta,
138
+ float* C,
139
+ size_t ldc);
140
+
141
+ cublasStatus_t CUBLASWINAPI cublasXtDgemm(cublasXtHandle_t handle,
142
+ cublasOperation_t transa,
143
+ cublasOperation_t transb,
144
+ size_t m,
145
+ size_t n,
146
+ size_t k,
147
+ const double* alpha,
148
+ const double* A,
149
+ size_t lda,
150
+ const double* B,
151
+ size_t ldb,
152
+ const double* beta,
153
+ double* C,
154
+ size_t ldc);
155
+
156
+ cublasStatus_t CUBLASWINAPI cublasXtCgemm(cublasXtHandle_t handle,
157
+ cublasOperation_t transa,
158
+ cublasOperation_t transb,
159
+ size_t m,
160
+ size_t n,
161
+ size_t k,
162
+ const cuComplex* alpha,
163
+ const cuComplex* A,
164
+ size_t lda,
165
+ const cuComplex* B,
166
+ size_t ldb,
167
+ const cuComplex* beta,
168
+ cuComplex* C,
169
+ size_t ldc);
170
+
171
+ cublasStatus_t CUBLASWINAPI cublasXtZgemm(cublasXtHandle_t handle,
172
+ cublasOperation_t transa,
173
+ cublasOperation_t transb,
174
+ size_t m,
175
+ size_t n,
176
+ size_t k,
177
+ const cuDoubleComplex* alpha,
178
+ const cuDoubleComplex* A,
179
+ size_t lda,
180
+ const cuDoubleComplex* B,
181
+ size_t ldb,
182
+ const cuDoubleComplex* beta,
183
+ cuDoubleComplex* C,
184
+ size_t ldc);
185
+ /* ------------------------------------------------------- */
186
+ /* SYRK */
187
+ cublasStatus_t CUBLASWINAPI cublasXtSsyrk(cublasXtHandle_t handle,
188
+ cublasFillMode_t uplo,
189
+ cublasOperation_t trans,
190
+ size_t n,
191
+ size_t k,
192
+ const float* alpha,
193
+ const float* A,
194
+ size_t lda,
195
+ const float* beta,
196
+ float* C,
197
+ size_t ldc);
198
+
199
+ cublasStatus_t CUBLASWINAPI cublasXtDsyrk(cublasXtHandle_t handle,
200
+ cublasFillMode_t uplo,
201
+ cublasOperation_t trans,
202
+ size_t n,
203
+ size_t k,
204
+ const double* alpha,
205
+ const double* A,
206
+ size_t lda,
207
+ const double* beta,
208
+ double* C,
209
+ size_t ldc);
210
+
211
+ cublasStatus_t CUBLASWINAPI cublasXtCsyrk(cublasXtHandle_t handle,
212
+ cublasFillMode_t uplo,
213
+ cublasOperation_t trans,
214
+ size_t n,
215
+ size_t k,
216
+ const cuComplex* alpha,
217
+ const cuComplex* A,
218
+ size_t lda,
219
+ const cuComplex* beta,
220
+ cuComplex* C,
221
+ size_t ldc);
222
+
223
+ cublasStatus_t CUBLASWINAPI cublasXtZsyrk(cublasXtHandle_t handle,
224
+ cublasFillMode_t uplo,
225
+ cublasOperation_t trans,
226
+ size_t n,
227
+ size_t k,
228
+ const cuDoubleComplex* alpha,
229
+ const cuDoubleComplex* A,
230
+ size_t lda,
231
+ const cuDoubleComplex* beta,
232
+ cuDoubleComplex* C,
233
+ size_t ldc);
234
+ /* -------------------------------------------------------------------- */
235
+ /* HERK */
236
+ cublasStatus_t CUBLASWINAPI cublasXtCherk(cublasXtHandle_t handle,
237
+ cublasFillMode_t uplo,
238
+ cublasOperation_t trans,
239
+ size_t n,
240
+ size_t k,
241
+ const float* alpha,
242
+ const cuComplex* A,
243
+ size_t lda,
244
+ const float* beta,
245
+ cuComplex* C,
246
+ size_t ldc);
247
+
248
+ cublasStatus_t CUBLASWINAPI cublasXtZherk(cublasXtHandle_t handle,
249
+ cublasFillMode_t uplo,
250
+ cublasOperation_t trans,
251
+ size_t n,
252
+ size_t k,
253
+ const double* alpha,
254
+ const cuDoubleComplex* A,
255
+ size_t lda,
256
+ const double* beta,
257
+ cuDoubleComplex* C,
258
+ size_t ldc);
259
+ /* -------------------------------------------------------------------- */
260
+ /* SYR2K */
261
+ cublasStatus_t CUBLASWINAPI cublasXtSsyr2k(cublasXtHandle_t handle,
262
+ cublasFillMode_t uplo,
263
+ cublasOperation_t trans,
264
+ size_t n,
265
+ size_t k,
266
+ const float* alpha,
267
+ const float* A,
268
+ size_t lda,
269
+ const float* B,
270
+ size_t ldb,
271
+ const float* beta,
272
+ float* C,
273
+ size_t ldc);
274
+
275
+ cublasStatus_t CUBLASWINAPI cublasXtDsyr2k(cublasXtHandle_t handle,
276
+ cublasFillMode_t uplo,
277
+ cublasOperation_t trans,
278
+ size_t n,
279
+ size_t k,
280
+ const double* alpha,
281
+ const double* A,
282
+ size_t lda,
283
+ const double* B,
284
+ size_t ldb,
285
+ const double* beta,
286
+ double* C,
287
+ size_t ldc);
288
+
289
+ cublasStatus_t CUBLASWINAPI cublasXtCsyr2k(cublasXtHandle_t handle,
290
+ cublasFillMode_t uplo,
291
+ cublasOperation_t trans,
292
+ size_t n,
293
+ size_t k,
294
+ const cuComplex* alpha,
295
+ const cuComplex* A,
296
+ size_t lda,
297
+ const cuComplex* B,
298
+ size_t ldb,
299
+ const cuComplex* beta,
300
+ cuComplex* C,
301
+ size_t ldc);
302
+
303
+ cublasStatus_t CUBLASWINAPI cublasXtZsyr2k(cublasXtHandle_t handle,
304
+ cublasFillMode_t uplo,
305
+ cublasOperation_t trans,
306
+ size_t n,
307
+ size_t k,
308
+ const cuDoubleComplex* alpha,
309
+ const cuDoubleComplex* A,
310
+ size_t lda,
311
+ const cuDoubleComplex* B,
312
+ size_t ldb,
313
+ const cuDoubleComplex* beta,
314
+ cuDoubleComplex* C,
315
+ size_t ldc);
316
+ /* -------------------------------------------------------------------- */
317
+ /* HERKX : variant extension of HERK */
318
+ cublasStatus_t CUBLASWINAPI cublasXtCherkx(cublasXtHandle_t handle,
319
+ cublasFillMode_t uplo,
320
+ cublasOperation_t trans,
321
+ size_t n,
322
+ size_t k,
323
+ const cuComplex* alpha,
324
+ const cuComplex* A,
325
+ size_t lda,
326
+ const cuComplex* B,
327
+ size_t ldb,
328
+ const float* beta,
329
+ cuComplex* C,
330
+ size_t ldc);
331
+
332
+ cublasStatus_t CUBLASWINAPI cublasXtZherkx(cublasXtHandle_t handle,
333
+ cublasFillMode_t uplo,
334
+ cublasOperation_t trans,
335
+ size_t n,
336
+ size_t k,
337
+ const cuDoubleComplex* alpha,
338
+ const cuDoubleComplex* A,
339
+ size_t lda,
340
+ const cuDoubleComplex* B,
341
+ size_t ldb,
342
+ const double* beta,
343
+ cuDoubleComplex* C,
344
+ size_t ldc);
345
+
346
+ /* -------------------------------------------------------------------- */
347
+ /* TRSM */
348
+ cublasStatus_t CUBLASWINAPI cublasXtStrsm(cublasXtHandle_t handle,
349
+ cublasSideMode_t side,
350
+ cublasFillMode_t uplo,
351
+ cublasOperation_t trans,
352
+ cublasDiagType_t diag,
353
+ size_t m,
354
+ size_t n,
355
+ const float* alpha,
356
+ const float* A,
357
+ size_t lda,
358
+ float* B,
359
+ size_t ldb);
360
+
361
+ cublasStatus_t CUBLASWINAPI cublasXtDtrsm(cublasXtHandle_t handle,
362
+ cublasSideMode_t side,
363
+ cublasFillMode_t uplo,
364
+ cublasOperation_t trans,
365
+ cublasDiagType_t diag,
366
+ size_t m,
367
+ size_t n,
368
+ const double* alpha,
369
+ const double* A,
370
+ size_t lda,
371
+ double* B,
372
+ size_t ldb);
373
+
374
+ cublasStatus_t CUBLASWINAPI cublasXtCtrsm(cublasXtHandle_t handle,
375
+ cublasSideMode_t side,
376
+ cublasFillMode_t uplo,
377
+ cublasOperation_t trans,
378
+ cublasDiagType_t diag,
379
+ size_t m,
380
+ size_t n,
381
+ const cuComplex* alpha,
382
+ const cuComplex* A,
383
+ size_t lda,
384
+ cuComplex* B,
385
+ size_t ldb);
386
+
387
+ cublasStatus_t CUBLASWINAPI cublasXtZtrsm(cublasXtHandle_t handle,
388
+ cublasSideMode_t side,
389
+ cublasFillMode_t uplo,
390
+ cublasOperation_t trans,
391
+ cublasDiagType_t diag,
392
+ size_t m,
393
+ size_t n,
394
+ const cuDoubleComplex* alpha,
395
+ const cuDoubleComplex* A,
396
+ size_t lda,
397
+ cuDoubleComplex* B,
398
+ size_t ldb);
399
+ /* -------------------------------------------------------------------- */
400
+ /* SYMM : Symmetric Multiply Matrix*/
401
+ cublasStatus_t CUBLASWINAPI cublasXtSsymm(cublasXtHandle_t handle,
402
+ cublasSideMode_t side,
403
+ cublasFillMode_t uplo,
404
+ size_t m,
405
+ size_t n,
406
+ const float* alpha,
407
+ const float* A,
408
+ size_t lda,
409
+ const float* B,
410
+ size_t ldb,
411
+ const float* beta,
412
+ float* C,
413
+ size_t ldc);
414
+
415
+ cublasStatus_t CUBLASWINAPI cublasXtDsymm(cublasXtHandle_t handle,
416
+ cublasSideMode_t side,
417
+ cublasFillMode_t uplo,
418
+ size_t m,
419
+ size_t n,
420
+ const double* alpha,
421
+ const double* A,
422
+ size_t lda,
423
+ const double* B,
424
+ size_t ldb,
425
+ const double* beta,
426
+ double* C,
427
+ size_t ldc);
428
+
429
+ cublasStatus_t CUBLASWINAPI cublasXtCsymm(cublasXtHandle_t handle,
430
+ cublasSideMode_t side,
431
+ cublasFillMode_t uplo,
432
+ size_t m,
433
+ size_t n,
434
+ const cuComplex* alpha,
435
+ const cuComplex* A,
436
+ size_t lda,
437
+ const cuComplex* B,
438
+ size_t ldb,
439
+ const cuComplex* beta,
440
+ cuComplex* C,
441
+ size_t ldc);
442
+
443
+ cublasStatus_t CUBLASWINAPI cublasXtZsymm(cublasXtHandle_t handle,
444
+ cublasSideMode_t side,
445
+ cublasFillMode_t uplo,
446
+ size_t m,
447
+ size_t n,
448
+ const cuDoubleComplex* alpha,
449
+ const cuDoubleComplex* A,
450
+ size_t lda,
451
+ const cuDoubleComplex* B,
452
+ size_t ldb,
453
+ const cuDoubleComplex* beta,
454
+ cuDoubleComplex* C,
455
+ size_t ldc);
456
+ /* -------------------------------------------------------------------- */
457
+ /* HEMM : Hermitian Matrix Multiply */
458
+ cublasStatus_t CUBLASWINAPI cublasXtChemm(cublasXtHandle_t handle,
459
+ cublasSideMode_t side,
460
+ cublasFillMode_t uplo,
461
+ size_t m,
462
+ size_t n,
463
+ const cuComplex* alpha,
464
+ const cuComplex* A,
465
+ size_t lda,
466
+ const cuComplex* B,
467
+ size_t ldb,
468
+ const cuComplex* beta,
469
+ cuComplex* C,
470
+ size_t ldc);
471
+
472
+ cublasStatus_t CUBLASWINAPI cublasXtZhemm(cublasXtHandle_t handle,
473
+ cublasSideMode_t side,
474
+ cublasFillMode_t uplo,
475
+ size_t m,
476
+ size_t n,
477
+ const cuDoubleComplex* alpha,
478
+ const cuDoubleComplex* A,
479
+ size_t lda,
480
+ const cuDoubleComplex* B,
481
+ size_t ldb,
482
+ const cuDoubleComplex* beta,
483
+ cuDoubleComplex* C,
484
+ size_t ldc);
485
+
486
+ /* -------------------------------------------------------------------- */
487
+ /* SYRKX : variant extension of SYRK */
488
+ cublasStatus_t CUBLASWINAPI cublasXtSsyrkx(cublasXtHandle_t handle,
489
+ cublasFillMode_t uplo,
490
+ cublasOperation_t trans,
491
+ size_t n,
492
+ size_t k,
493
+ const float* alpha,
494
+ const float* A,
495
+ size_t lda,
496
+ const float* B,
497
+ size_t ldb,
498
+ const float* beta,
499
+ float* C,
500
+ size_t ldc);
501
+
502
+ cublasStatus_t CUBLASWINAPI cublasXtDsyrkx(cublasXtHandle_t handle,
503
+ cublasFillMode_t uplo,
504
+ cublasOperation_t trans,
505
+ size_t n,
506
+ size_t k,
507
+ const double* alpha,
508
+ const double* A,
509
+ size_t lda,
510
+ const double* B,
511
+ size_t ldb,
512
+ const double* beta,
513
+ double* C,
514
+ size_t ldc);
515
+
516
+ cublasStatus_t CUBLASWINAPI cublasXtCsyrkx(cublasXtHandle_t handle,
517
+ cublasFillMode_t uplo,
518
+ cublasOperation_t trans,
519
+ size_t n,
520
+ size_t k,
521
+ const cuComplex* alpha,
522
+ const cuComplex* A,
523
+ size_t lda,
524
+ const cuComplex* B,
525
+ size_t ldb,
526
+ const cuComplex* beta,
527
+ cuComplex* C,
528
+ size_t ldc);
529
+
530
+ cublasStatus_t CUBLASWINAPI cublasXtZsyrkx(cublasXtHandle_t handle,
531
+ cublasFillMode_t uplo,
532
+ cublasOperation_t trans,
533
+ size_t n,
534
+ size_t k,
535
+ const cuDoubleComplex* alpha,
536
+ const cuDoubleComplex* A,
537
+ size_t lda,
538
+ const cuDoubleComplex* B,
539
+ size_t ldb,
540
+ const cuDoubleComplex* beta,
541
+ cuDoubleComplex* C,
542
+ size_t ldc);
543
+ /* -------------------------------------------------------------------- */
544
+ /* HER2K : variant extension of HERK */
545
+ cublasStatus_t CUBLASWINAPI cublasXtCher2k(cublasXtHandle_t handle,
546
+ cublasFillMode_t uplo,
547
+ cublasOperation_t trans,
548
+ size_t n,
549
+ size_t k,
550
+ const cuComplex* alpha,
551
+ const cuComplex* A,
552
+ size_t lda,
553
+ const cuComplex* B,
554
+ size_t ldb,
555
+ const float* beta,
556
+ cuComplex* C,
557
+ size_t ldc);
558
+
559
+ cublasStatus_t CUBLASWINAPI cublasXtZher2k(cublasXtHandle_t handle,
560
+ cublasFillMode_t uplo,
561
+ cublasOperation_t trans,
562
+ size_t n,
563
+ size_t k,
564
+ const cuDoubleComplex* alpha,
565
+ const cuDoubleComplex* A,
566
+ size_t lda,
567
+ const cuDoubleComplex* B,
568
+ size_t ldb,
569
+ const double* beta,
570
+ cuDoubleComplex* C,
571
+ size_t ldc);
572
+
573
+ /* -------------------------------------------------------------------- */
574
+ /* SPMM : Symmetric Packed Multiply Matrix*/
575
+ cublasStatus_t CUBLASWINAPI cublasXtSspmm(cublasXtHandle_t handle,
576
+ cublasSideMode_t side,
577
+ cublasFillMode_t uplo,
578
+ size_t m,
579
+ size_t n,
580
+ const float* alpha,
581
+ const float* AP,
582
+ const float* B,
583
+ size_t ldb,
584
+ const float* beta,
585
+ float* C,
586
+ size_t ldc);
587
+
588
+ cublasStatus_t CUBLASWINAPI cublasXtDspmm(cublasXtHandle_t handle,
589
+ cublasSideMode_t side,
590
+ cublasFillMode_t uplo,
591
+ size_t m,
592
+ size_t n,
593
+ const double* alpha,
594
+ const double* AP,
595
+ const double* B,
596
+ size_t ldb,
597
+ const double* beta,
598
+ double* C,
599
+ size_t ldc);
600
+
601
+ cublasStatus_t CUBLASWINAPI cublasXtCspmm(cublasXtHandle_t handle,
602
+ cublasSideMode_t side,
603
+ cublasFillMode_t uplo,
604
+ size_t m,
605
+ size_t n,
606
+ const cuComplex* alpha,
607
+ const cuComplex* AP,
608
+ const cuComplex* B,
609
+ size_t ldb,
610
+ const cuComplex* beta,
611
+ cuComplex* C,
612
+ size_t ldc);
613
+
614
+ cublasStatus_t CUBLASWINAPI cublasXtZspmm(cublasXtHandle_t handle,
615
+ cublasSideMode_t side,
616
+ cublasFillMode_t uplo,
617
+ size_t m,
618
+ size_t n,
619
+ const cuDoubleComplex* alpha,
620
+ const cuDoubleComplex* AP,
621
+ const cuDoubleComplex* B,
622
+ size_t ldb,
623
+ const cuDoubleComplex* beta,
624
+ cuDoubleComplex* C,
625
+ size_t ldc);
626
+
627
+ /* -------------------------------------------------------------------- */
628
+ /* TRMM */
629
+ cublasStatus_t CUBLASWINAPI cublasXtStrmm(cublasXtHandle_t handle,
630
+ cublasSideMode_t side,
631
+ cublasFillMode_t uplo,
632
+ cublasOperation_t trans,
633
+ cublasDiagType_t diag,
634
+ size_t m,
635
+ size_t n,
636
+ const float* alpha,
637
+ const float* A,
638
+ size_t lda,
639
+ const float* B,
640
+ size_t ldb,
641
+ float* C,
642
+ size_t ldc);
643
+
644
+ cublasStatus_t CUBLASWINAPI cublasXtDtrmm(cublasXtHandle_t handle,
645
+ cublasSideMode_t side,
646
+ cublasFillMode_t uplo,
647
+ cublasOperation_t trans,
648
+ cublasDiagType_t diag,
649
+ size_t m,
650
+ size_t n,
651
+ const double* alpha,
652
+ const double* A,
653
+ size_t lda,
654
+ const double* B,
655
+ size_t ldb,
656
+ double* C,
657
+ size_t ldc);
658
+
659
+ cublasStatus_t CUBLASWINAPI cublasXtCtrmm(cublasXtHandle_t handle,
660
+ cublasSideMode_t side,
661
+ cublasFillMode_t uplo,
662
+ cublasOperation_t trans,
663
+ cublasDiagType_t diag,
664
+ size_t m,
665
+ size_t n,
666
+ const cuComplex* alpha,
667
+ const cuComplex* A,
668
+ size_t lda,
669
+ const cuComplex* B,
670
+ size_t ldb,
671
+ cuComplex* C,
672
+ size_t ldc);
673
+
674
+ cublasStatus_t CUBLASWINAPI cublasXtZtrmm(cublasXtHandle_t handle,
675
+ cublasSideMode_t side,
676
+ cublasFillMode_t uplo,
677
+ cublasOperation_t trans,
678
+ cublasDiagType_t diag,
679
+ size_t m,
680
+ size_t n,
681
+ const cuDoubleComplex* alpha,
682
+ const cuDoubleComplex* A,
683
+ size_t lda,
684
+ const cuDoubleComplex* B,
685
+ size_t ldb,
686
+ cuDoubleComplex* C,
687
+ size_t ldc);
688
+
689
+ #if defined(__cplusplus)
690
+ }
691
+ #endif /* __cplusplus */
692
+
693
+ #endif /* !defined(CUBLAS_XT_H_) */
.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_v2.h ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * This is the public header file for the new CUBLAS library API, it mapped the generic
52
+ * Cublas name functions to the actual _v2 implementations.
53
+ */
54
+
55
+ #if !defined(CUBLAS_V2_H_)
56
+ #define CUBLAS_V2_H_
57
+
58
+ #if defined(CUBLAS_H_)
59
+ #error "It is an error to include both cublas.h and cublas_v2.h"
60
+ #endif
61
+
62
+ #undef CUBLASAPI
63
+ #ifdef __CUDACC__
64
+ #define CUBLASAPI __host__ __device__
65
+ #else
66
+ #define CUBLASAPI
67
+ #endif
68
+
69
+ #include "cublas_api.h"
70
+
71
+ #define cublasCreate cublasCreate_v2
72
+ #define cublasDestroy cublasDestroy_v2
73
+ #define cublasGetVersion cublasGetVersion_v2
74
+ #define cublasSetWorkspace cublasSetWorkspace_v2
75
+ #define cublasSetStream cublasSetStream_v2
76
+ #define cublasGetStream cublasGetStream_v2
77
+ #define cublasGetPointerMode cublasGetPointerMode_v2
78
+ #define cublasSetPointerMode cublasSetPointerMode_v2
79
+
80
+ /* 32-bit integer */
81
+
82
+ /* Blas1 Routines */
83
+
84
+ #define cublasSnrm2 cublasSnrm2_v2
85
+ #define cublasDnrm2 cublasDnrm2_v2
86
+ #define cublasScnrm2 cublasScnrm2_v2
87
+ #define cublasDznrm2 cublasDznrm2_v2
88
+
89
+ #define cublasSdot cublasSdot_v2
90
+ #define cublasDdot cublasDdot_v2
91
+ #define cublasCdotu cublasCdotu_v2
92
+ #define cublasCdotc cublasCdotc_v2
93
+ #define cublasZdotu cublasZdotu_v2
94
+ #define cublasZdotc cublasZdotc_v2
95
+
96
+ #define cublasSscal cublasSscal_v2
97
+ #define cublasDscal cublasDscal_v2
98
+ #define cublasCscal cublasCscal_v2
99
+ #define cublasCsscal cublasCsscal_v2
100
+ #define cublasZscal cublasZscal_v2
101
+ #define cublasZdscal cublasZdscal_v2
102
+
103
+ #define cublasSaxpy cublasSaxpy_v2
104
+ #define cublasDaxpy cublasDaxpy_v2
105
+ #define cublasCaxpy cublasCaxpy_v2
106
+ #define cublasZaxpy cublasZaxpy_v2
107
+
108
+ #define cublasScopy cublasScopy_v2
109
+ #define cublasDcopy cublasDcopy_v2
110
+ #define cublasCcopy cublasCcopy_v2
111
+ #define cublasZcopy cublasZcopy_v2
112
+
113
+ #define cublasSswap cublasSswap_v2
114
+ #define cublasDswap cublasDswap_v2
115
+ #define cublasCswap cublasCswap_v2
116
+ #define cublasZswap cublasZswap_v2
117
+
118
+ #define cublasIsamax cublasIsamax_v2
119
+ #define cublasIdamax cublasIdamax_v2
120
+ #define cublasIcamax cublasIcamax_v2
121
+ #define cublasIzamax cublasIzamax_v2
122
+
123
+ #define cublasIsamin cublasIsamin_v2
124
+ #define cublasIdamin cublasIdamin_v2
125
+ #define cublasIcamin cublasIcamin_v2
126
+ #define cublasIzamin cublasIzamin_v2
127
+
128
+ #define cublasSasum cublasSasum_v2
129
+ #define cublasDasum cublasDasum_v2
130
+ #define cublasScasum cublasScasum_v2
131
+ #define cublasDzasum cublasDzasum_v2
132
+
133
+ #define cublasSrot cublasSrot_v2
134
+ #define cublasDrot cublasDrot_v2
135
+ #define cublasCrot cublasCrot_v2
136
+ #define cublasCsrot cublasCsrot_v2
137
+ #define cublasZrot cublasZrot_v2
138
+ #define cublasZdrot cublasZdrot_v2
139
+
140
+ #define cublasSrotg cublasSrotg_v2
141
+ #define cublasDrotg cublasDrotg_v2
142
+ #define cublasCrotg cublasCrotg_v2
143
+ #define cublasZrotg cublasZrotg_v2
144
+
145
+ #define cublasSrotm cublasSrotm_v2
146
+ #define cublasDrotm cublasDrotm_v2
147
+
148
+ #define cublasSrotmg cublasSrotmg_v2
149
+ #define cublasDrotmg cublasDrotmg_v2
150
+
151
+ /* Blas2 Routines */
152
+
153
+ #define cublasSgemv cublasSgemv_v2
154
+ #define cublasDgemv cublasDgemv_v2
155
+ #define cublasCgemv cublasCgemv_v2
156
+ #define cublasZgemv cublasZgemv_v2
157
+
158
+ #define cublasSgbmv cublasSgbmv_v2
159
+ #define cublasDgbmv cublasDgbmv_v2
160
+ #define cublasCgbmv cublasCgbmv_v2
161
+ #define cublasZgbmv cublasZgbmv_v2
162
+
163
+ #define cublasStrmv cublasStrmv_v2
164
+ #define cublasDtrmv cublasDtrmv_v2
165
+ #define cublasCtrmv cublasCtrmv_v2
166
+ #define cublasZtrmv cublasZtrmv_v2
167
+
168
+ #define cublasStbmv cublasStbmv_v2
169
+ #define cublasDtbmv cublasDtbmv_v2
170
+ #define cublasCtbmv cublasCtbmv_v2
171
+ #define cublasZtbmv cublasZtbmv_v2
172
+
173
+ #define cublasStpmv cublasStpmv_v2
174
+ #define cublasDtpmv cublasDtpmv_v2
175
+ #define cublasCtpmv cublasCtpmv_v2
176
+ #define cublasZtpmv cublasZtpmv_v2
177
+
178
+ #define cublasStrsv cublasStrsv_v2
179
+ #define cublasDtrsv cublasDtrsv_v2
180
+ #define cublasCtrsv cublasCtrsv_v2
181
+ #define cublasZtrsv cublasZtrsv_v2
182
+
183
+ #define cublasStpsv cublasStpsv_v2
184
+ #define cublasDtpsv cublasDtpsv_v2
185
+ #define cublasCtpsv cublasCtpsv_v2
186
+ #define cublasZtpsv cublasZtpsv_v2
187
+
188
+ #define cublasStbsv cublasStbsv_v2
189
+ #define cublasDtbsv cublasDtbsv_v2
190
+ #define cublasCtbsv cublasCtbsv_v2
191
+ #define cublasZtbsv cublasZtbsv_v2
192
+
193
+ #define cublasSsymv cublasSsymv_v2
194
+ #define cublasDsymv cublasDsymv_v2
195
+ #define cublasCsymv cublasCsymv_v2
196
+ #define cublasZsymv cublasZsymv_v2
197
+ #define cublasChemv cublasChemv_v2
198
+ #define cublasZhemv cublasZhemv_v2
199
+
200
+ #define cublasSsbmv cublasSsbmv_v2
201
+ #define cublasDsbmv cublasDsbmv_v2
202
+ #define cublasChbmv cublasChbmv_v2
203
+ #define cublasZhbmv cublasZhbmv_v2
204
+
205
+ #define cublasSspmv cublasSspmv_v2
206
+ #define cublasDspmv cublasDspmv_v2
207
+ #define cublasChpmv cublasChpmv_v2
208
+ #define cublasZhpmv cublasZhpmv_v2
209
+
210
+ #define cublasSger cublasSger_v2
211
+ #define cublasDger cublasDger_v2
212
+ #define cublasCgeru cublasCgeru_v2
213
+ #define cublasCgerc cublasCgerc_v2
214
+ #define cublasZgeru cublasZgeru_v2
215
+ #define cublasZgerc cublasZgerc_v2
216
+
217
+ #define cublasSsyr cublasSsyr_v2
218
+ #define cublasDsyr cublasDsyr_v2
219
+ #define cublasCsyr cublasCsyr_v2
220
+ #define cublasZsyr cublasZsyr_v2
221
+ #define cublasCher cublasCher_v2
222
+ #define cublasZher cublasZher_v2
223
+
224
+ #define cublasSspr cublasSspr_v2
225
+ #define cublasDspr cublasDspr_v2
226
+ #define cublasChpr cublasChpr_v2
227
+ #define cublasZhpr cublasZhpr_v2
228
+
229
+ #define cublasSsyr2 cublasSsyr2_v2
230
+ #define cublasDsyr2 cublasDsyr2_v2
231
+ #define cublasCsyr2 cublasCsyr2_v2
232
+ #define cublasZsyr2 cublasZsyr2_v2
233
+ #define cublasCher2 cublasCher2_v2
234
+ #define cublasZher2 cublasZher2_v2
235
+
236
+ #define cublasSspr2 cublasSspr2_v2
237
+ #define cublasDspr2 cublasDspr2_v2
238
+ #define cublasChpr2 cublasChpr2_v2
239
+ #define cublasZhpr2 cublasZhpr2_v2
240
+
241
+ /* Blas3 Routines */
242
+
243
+ #define cublasSgemm cublasSgemm_v2
244
+ #define cublasDgemm cublasDgemm_v2
245
+ #define cublasCgemm cublasCgemm_v2
246
+ #define cublasZgemm cublasZgemm_v2
247
+
248
+ #define cublasSsyrk cublasSsyrk_v2
249
+ #define cublasDsyrk cublasDsyrk_v2
250
+ #define cublasCsyrk cublasCsyrk_v2
251
+ #define cublasZsyrk cublasZsyrk_v2
252
+ #define cublasCherk cublasCherk_v2
253
+ #define cublasZherk cublasZherk_v2
254
+
255
+ #define cublasSsyr2k cublasSsyr2k_v2
256
+ #define cublasDsyr2k cublasDsyr2k_v2
257
+ #define cublasCsyr2k cublasCsyr2k_v2
258
+ #define cublasZsyr2k cublasZsyr2k_v2
259
+ #define cublasCher2k cublasCher2k_v2
260
+ #define cublasZher2k cublasZher2k_v2
261
+
262
+ #define cublasSsymm cublasSsymm_v2
263
+ #define cublasDsymm cublasDsymm_v2
264
+ #define cublasCsymm cublasCsymm_v2
265
+ #define cublasZsymm cublasZsymm_v2
266
+ #define cublasChemm cublasChemm_v2
267
+ #define cublasZhemm cublasZhemm_v2
268
+
269
+ #define cublasStrsm cublasStrsm_v2
270
+ #define cublasDtrsm cublasDtrsm_v2
271
+ #define cublasCtrsm cublasCtrsm_v2
272
+ #define cublasZtrsm cublasZtrsm_v2
273
+
274
+ #define cublasStrmm cublasStrmm_v2
275
+ #define cublasDtrmm cublasDtrmm_v2
276
+ #define cublasCtrmm cublasCtrmm_v2
277
+ #define cublasZtrmm cublasZtrmm_v2
278
+
279
+ /* 64-bit integer */
280
+
281
+ /* Blas1 Routines */
282
+
283
+ #define cublasSnrm2_64 cublasSnrm2_v2_64
284
+ #define cublasDnrm2_64 cublasDnrm2_v2_64
285
+ #define cublasScnrm2_64 cublasScnrm2_v2_64
286
+ #define cublasDznrm2_64 cublasDznrm2_v2_64
287
+
288
+ #define cublasSdot_64 cublasSdot_v2_64
289
+ #define cublasDdot_64 cublasDdot_v2_64
290
+ #define cublasCdotu_64 cublasCdotu_v2_64
291
+ #define cublasCdotc_64 cublasCdotc_v2_64
292
+ #define cublasZdotu_64 cublasZdotu_v2_64
293
+ #define cublasZdotc_64 cublasZdotc_v2_64
294
+
295
+ #define cublasSscal_64 cublasSscal_v2_64
296
+ #define cublasDscal_64 cublasDscal_v2_64
297
+ #define cublasCscal_64 cublasCscal_v2_64
298
+ #define cublasCsscal_64 cublasCsscal_v2_64
299
+ #define cublasZscal_64 cublasZscal_v2_64
300
+ #define cublasZdscal_64 cublasZdscal_v2_64
301
+
302
+ #define cublasSaxpy_64 cublasSaxpy_v2_64
303
+ #define cublasDaxpy_64 cublasDaxpy_v2_64
304
+ #define cublasCaxpy_64 cublasCaxpy_v2_64
305
+ #define cublasZaxpy_64 cublasZaxpy_v2_64
306
+
307
+ #define cublasScopy_64 cublasScopy_v2_64
308
+ #define cublasDcopy_64 cublasDcopy_v2_64
309
+ #define cublasCcopy_64 cublasCcopy_v2_64
310
+ #define cublasZcopy_64 cublasZcopy_v2_64
311
+
312
+ #define cublasSswap_64 cublasSswap_v2_64
313
+ #define cublasDswap_64 cublasDswap_v2_64
314
+ #define cublasCswap_64 cublasCswap_v2_64
315
+ #define cublasZswap_64 cublasZswap_v2_64
316
+
317
+ #define cublasIsamax_64 cublasIsamax_v2_64
318
+ #define cublasIdamax_64 cublasIdamax_v2_64
319
+ #define cublasIcamax_64 cublasIcamax_v2_64
320
+ #define cublasIzamax_64 cublasIzamax_v2_64
321
+
322
+ #define cublasIsamin_64 cublasIsamin_v2_64
323
+ #define cublasIdamin_64 cublasIdamin_v2_64
324
+ #define cublasIcamin_64 cublasIcamin_v2_64
325
+ #define cublasIzamin_64 cublasIzamin_v2_64
326
+
327
+ #define cublasSasum_64 cublasSasum_v2_64
328
+ #define cublasDasum_64 cublasDasum_v2_64
329
+ #define cublasScasum_64 cublasScasum_v2_64
330
+ #define cublasDzasum_64 cublasDzasum_v2_64
331
+
332
+ #define cublasSrot_64 cublasSrot_v2_64
333
+ #define cublasDrot_64 cublasDrot_v2_64
334
+ #define cublasCrot_64 cublasCrot_v2_64
335
+ #define cublasCsrot_64 cublasCsrot_v2_64
336
+ #define cublasZrot_64 cublasZrot_v2_64
337
+ #define cublasZdrot_64 cublasZdrot_v2_64
338
+
339
+ #define cublasSrotg_64 cublasSrotg_v2_64
340
+ #define cublasDrotg_64 cublasDrotg_v2_64
341
+ #define cublasCrotg_64 cublasCrotg_v2_64
342
+ #define cublasZrotg_64 cublasZrotg_v2_64
343
+
344
+ #define cublasSrotm_64 cublasSrotm_v2_64
345
+ #define cublasDrotm_64 cublasDrotm_v2_64
346
+
347
+ #define cublasSrotmg_64 cublasSrotmg_v2_64
348
+ #define cublasDrotmg_64 cublasDrotmg_v2_64
349
+
350
+ /* Blas2 Routines */
351
+
352
+ #define cublasSgemv_64 cublasSgemv_v2_64
353
+ #define cublasDgemv_64 cublasDgemv_v2_64
354
+ #define cublasCgemv_64 cublasCgemv_v2_64
355
+ #define cublasZgemv_64 cublasZgemv_v2_64
356
+
357
+ #define cublasSgbmv_64 cublasSgbmv_v2_64
358
+ #define cublasDgbmv_64 cublasDgbmv_v2_64
359
+ #define cublasCgbmv_64 cublasCgbmv_v2_64
360
+ #define cublasZgbmv_64 cublasZgbmv_v2_64
361
+
362
+ #define cublasStrmv_64 cublasStrmv_v2_64
363
+ #define cublasDtrmv_64 cublasDtrmv_v2_64
364
+ #define cublasCtrmv_64 cublasCtrmv_v2_64
365
+ #define cublasZtrmv_64 cublasZtrmv_v2_64
366
+
367
+ #define cublasStbmv_64 cublasStbmv_v2_64
368
+ #define cublasDtbmv_64 cublasDtbmv_v2_64
369
+ #define cublasCtbmv_64 cublasCtbmv_v2_64
370
+ #define cublasZtbmv_64 cublasZtbmv_v2_64
371
+
372
+ #define cublasStpmv_64 cublasStpmv_v2_64
373
+ #define cublasDtpmv_64 cublasDtpmv_v2_64
374
+ #define cublasCtpmv_64 cublasCtpmv_v2_64
375
+ #define cublasZtpmv_64 cublasZtpmv_v2_64
376
+
377
+ #define cublasStrsv_64 cublasStrsv_v2_64
378
+ #define cublasDtrsv_64 cublasDtrsv_v2_64
379
+ #define cublasCtrsv_64 cublasCtrsv_v2_64
380
+ #define cublasZtrsv_64 cublasZtrsv_v2_64
381
+
382
+ #define cublasStpsv_64 cublasStpsv_v2_64
383
+ #define cublasDtpsv_64 cublasDtpsv_v2_64
384
+ #define cublasCtpsv_64 cublasCtpsv_v2_64
385
+ #define cublasZtpsv_64 cublasZtpsv_v2_64
386
+
387
+ #define cublasStbsv_64 cublasStbsv_v2_64
388
+ #define cublasDtbsv_64 cublasDtbsv_v2_64
389
+ #define cublasCtbsv_64 cublasCtbsv_v2_64
390
+ #define cublasZtbsv_64 cublasZtbsv_v2_64
391
+
392
+ #define cublasSsymv_64 cublasSsymv_v2_64
393
+ #define cublasDsymv_64 cublasDsymv_v2_64
394
+ #define cublasCsymv_64 cublasCsymv_v2_64
395
+ #define cublasZsymv_64 cublasZsymv_v2_64
396
+ #define cublasChemv_64 cublasChemv_v2_64
397
+ #define cublasZhemv_64 cublasZhemv_v2_64
398
+
399
+ #define cublasSsbmv_64 cublasSsbmv_v2_64
400
+ #define cublasDsbmv_64 cublasDsbmv_v2_64
401
+ #define cublasChbmv_64 cublasChbmv_v2_64
402
+ #define cublasZhbmv_64 cublasZhbmv_v2_64
403
+
404
+ #define cublasSspmv_64 cublasSspmv_v2_64
405
+ #define cublasDspmv_64 cublasDspmv_v2_64
406
+ #define cublasChpmv_64 cublasChpmv_v2_64
407
+ #define cublasZhpmv_64 cublasZhpmv_v2_64
408
+
409
+ #define cublasSger_64 cublasSger_v2_64
410
+ #define cublasDger_64 cublasDger_v2_64
411
+ #define cublasCgeru_64 cublasCgeru_v2_64
412
+ #define cublasCgerc_64 cublasCgerc_v2_64
413
+ #define cublasZgeru_64 cublasZgeru_v2_64
414
+ #define cublasZgerc_64 cublasZgerc_v2_64
415
+
416
+ #define cublasSsyr_64 cublasSsyr_v2_64
417
+ #define cublasDsyr_64 cublasDsyr_v2_64
418
+ #define cublasCsyr_64 cublasCsyr_v2_64
419
+ #define cublasZsyr_64 cublasZsyr_v2_64
420
+ #define cublasCher_64 cublasCher_v2_64
421
+ #define cublasZher_64 cublasZher_v2_64
422
+
423
+ #define cublasSspr_64 cublasSspr_v2_64
424
+ #define cublasDspr_64 cublasDspr_v2_64
425
+ #define cublasChpr_64 cublasChpr_v2_64
426
+ #define cublasZhpr_64 cublasZhpr_v2_64
427
+
428
+ #define cublasSsyr2_64 cublasSsyr2_v2_64
429
+ #define cublasDsyr2_64 cublasDsyr2_v2_64
430
+ #define cublasCsyr2_64 cublasCsyr2_v2_64
431
+ #define cublasZsyr2_64 cublasZsyr2_v2_64
432
+ #define cublasCher2_64 cublasCher2_v2_64
433
+ #define cublasZher2_64 cublasZher2_v2_64
434
+
435
+ #define cublasSspr2_64 cublasSspr2_v2_64
436
+ #define cublasDspr2_64 cublasDspr2_v2_64
437
+ #define cublasChpr2_64 cublasChpr2_v2_64
438
+ #define cublasZhpr2_64 cublasZhpr2_v2_64
439
+
440
+ /* Blas3 Routines */
441
+
442
+ #define cublasSgemm_64 cublasSgemm_v2_64
443
+ #define cublasDgemm_64 cublasDgemm_v2_64
444
+ #define cublasCgemm_64 cublasCgemm_v2_64
445
+ #define cublasZgemm_64 cublasZgemm_v2_64
446
+
447
+ #define cublasSsyrk_64 cublasSsyrk_v2_64
448
+ #define cublasDsyrk_64 cublasDsyrk_v2_64
449
+ #define cublasCsyrk_64 cublasCsyrk_v2_64
450
+ #define cublasZsyrk_64 cublasZsyrk_v2_64
451
+ #define cublasCherk_64 cublasCherk_v2_64
452
+ #define cublasZherk_64 cublasZherk_v2_64
453
+
454
+ #define cublasSsyr2k_64 cublasSsyr2k_v2_64
455
+ #define cublasDsyr2k_64 cublasDsyr2k_v2_64
456
+ #define cublasCsyr2k_64 cublasCsyr2k_v2_64
457
+ #define cublasZsyr2k_64 cublasZsyr2k_v2_64
458
+ #define cublasCher2k_64 cublasCher2k_v2_64
459
+ #define cublasZher2k_64 cublasZher2k_v2_64
460
+
461
+ #define cublasSsymm_64 cublasSsymm_v2_64
462
+ #define cublasDsymm_64 cublasDsymm_v2_64
463
+ #define cublasCsymm_64 cublasCsymm_v2_64
464
+ #define cublasZsymm_64 cublasZsymm_v2_64
465
+ #define cublasChemm_64 cublasChemm_v2_64
466
+ #define cublasZhemm_64 cublasZhemm_v2_64
467
+
468
+ #define cublasStrsm_64 cublasStrsm_v2_64
469
+ #define cublasDtrsm_64 cublasDtrsm_v2_64
470
+ #define cublasCtrsm_64 cublasCtrsm_v2_64
471
+ #define cublasZtrsm_64 cublasZtrsm_v2_64
472
+
473
+ #define cublasStrmm_64 cublasStrmm_v2_64
474
+ #define cublasDtrmm_64 cublasDtrmm_v2_64
475
+ #define cublasCtrmm_64 cublasCtrmm_v2_64
476
+ #define cublasZtrmm_64 cublasZtrmm_v2_64
477
+
478
+ #endif /* !defined(CUBLAS_V2_H_) */
.venv/lib/python3.11/site-packages/nvidia/cublas/include/nvblas.h ADDED
@@ -0,0 +1,824 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(NVBLAS_H_)
51
+ #define NVBLAS_H_
52
+
53
+ #include "driver_types.h"
54
+ #include "cuComplex.h" /* import complex data type */
55
+
56
+ #if defined(__cplusplus)
57
+ extern "C" {
58
+ #endif
59
+
60
+ /* GEMM */
61
+ void sgemm_(const char* transa,
62
+ const char* transb,
63
+ const int* m,
64
+ const int* n,
65
+ const int* k,
66
+ const float* alpha,
67
+ const float* a,
68
+ const int* lda,
69
+ const float* b,
70
+ const int* ldb,
71
+ const float* beta,
72
+ float* c,
73
+ const int* ldc);
74
+
75
+ void dgemm_(const char* transa,
76
+ const char* transb,
77
+ const int* m,
78
+ const int* n,
79
+ const int* k,
80
+ const double* alpha,
81
+ const double* a,
82
+ const int* lda,
83
+ const double* b,
84
+ const int* ldb,
85
+ const double* beta,
86
+ double* c,
87
+ const int* ldc);
88
+
89
+ void cgemm_(const char* transa,
90
+ const char* transb,
91
+ const int* m,
92
+ const int* n,
93
+ const int* k,
94
+ const cuComplex* alpha,
95
+ const cuComplex* a,
96
+ const int* lda,
97
+ const cuComplex* b,
98
+ const int* ldb,
99
+ const cuComplex* beta,
100
+ cuComplex* c,
101
+ const int* ldc);
102
+
103
+ void zgemm_(const char* transa,
104
+ const char* transb,
105
+ const int* m,
106
+ const int* n,
107
+ const int* k,
108
+ const cuDoubleComplex* alpha,
109
+ const cuDoubleComplex* a,
110
+ const int* lda,
111
+ const cuDoubleComplex* b,
112
+ const int* ldb,
113
+ const cuDoubleComplex* beta,
114
+ cuDoubleComplex* c,
115
+ const int* ldc);
116
+
117
+ void sgemm(const char* transa,
118
+ const char* transb,
119
+ const int* m,
120
+ const int* n,
121
+ const int* k,
122
+ const float* alpha,
123
+ const float* a,
124
+ const int* lda,
125
+ const float* b,
126
+ const int* ldb,
127
+ const float* beta,
128
+ float* c,
129
+ const int* ldc);
130
+
131
+ void dgemm(const char* transa,
132
+ const char* transb,
133
+ const int* m,
134
+ const int* n,
135
+ const int* k,
136
+ const double* alpha,
137
+ const double* a,
138
+ const int* lda,
139
+ const double* b,
140
+ const int* ldb,
141
+ const double* beta,
142
+ double* c,
143
+ const int* ldc);
144
+
145
+ void cgemm(const char* transa,
146
+ const char* transb,
147
+ const int* m,
148
+ const int* n,
149
+ const int* k,
150
+ const cuComplex* alpha,
151
+ const cuComplex* a,
152
+ const int* lda,
153
+ const cuComplex* b,
154
+ const int* ldb,
155
+ const cuComplex* beta,
156
+ cuComplex* c,
157
+ const int* ldc);
158
+
159
+ void zgemm(const char* transa,
160
+ const char* transb,
161
+ const int* m,
162
+ const int* n,
163
+ const int* k,
164
+ const cuDoubleComplex* alpha,
165
+ const cuDoubleComplex* a,
166
+ const int* lda,
167
+ const cuDoubleComplex* b,
168
+ const int* ldb,
169
+ const cuDoubleComplex* beta,
170
+ cuDoubleComplex* c,
171
+ const int* ldc);
172
+
173
+ /* SYRK */
174
+ void ssyrk_(const char* uplo,
175
+ const char* trans,
176
+ const int* n,
177
+ const int* k,
178
+ const float* alpha,
179
+ const float* a,
180
+ const int* lda,
181
+ const float* beta,
182
+ float* c,
183
+ const int* ldc);
184
+
185
+ void dsyrk_(const char* uplo,
186
+ const char* trans,
187
+ const int* n,
188
+ const int* k,
189
+ const double* alpha,
190
+ const double* a,
191
+ const int* lda,
192
+ const double* beta,
193
+ double* c,
194
+ const int* ldc);
195
+
196
+ void csyrk_(const char* uplo,
197
+ const char* trans,
198
+ const int* n,
199
+ const int* k,
200
+ const cuComplex* alpha,
201
+ const cuComplex* a,
202
+ const int* lda,
203
+ const cuComplex* beta,
204
+ cuComplex* c,
205
+ const int* ldc);
206
+
207
+ void zsyrk_(const char* uplo,
208
+ const char* trans,
209
+ const int* n,
210
+ const int* k,
211
+ const cuDoubleComplex* alpha,
212
+ const cuDoubleComplex* a,
213
+ const int* lda,
214
+ const cuDoubleComplex* beta,
215
+ cuDoubleComplex* c,
216
+ const int* ldc);
217
+
218
+ void ssyrk(const char* uplo,
219
+ const char* trans,
220
+ const int* n,
221
+ const int* k,
222
+ const float* alpha,
223
+ const float* a,
224
+ const int* lda,
225
+ const float* beta,
226
+ float* c,
227
+ const int* ldc);
228
+
229
+ void dsyrk(const char* uplo,
230
+ const char* trans,
231
+ const int* n,
232
+ const int* k,
233
+ const double* alpha,
234
+ const double* a,
235
+ const int* lda,
236
+ const double* beta,
237
+ double* c,
238
+ const int* ldc);
239
+
240
+ void csyrk(const char* uplo,
241
+ const char* trans,
242
+ const int* n,
243
+ const int* k,
244
+ const cuComplex* alpha,
245
+ const cuComplex* a,
246
+ const int* lda,
247
+ const cuComplex* beta,
248
+ cuComplex* c,
249
+ const int* ldc);
250
+
251
+ void zsyrk(const char* uplo,
252
+ const char* trans,
253
+ const int* n,
254
+ const int* k,
255
+ const cuDoubleComplex* alpha,
256
+ const cuDoubleComplex* a,
257
+ const int* lda,
258
+ const cuDoubleComplex* beta,
259
+ cuDoubleComplex* c,
260
+ const int* ldc);
261
+
262
+ /* HERK */
263
+ void cherk_(const char* uplo,
264
+ const char* trans,
265
+ const int* n,
266
+ const int* k,
267
+ const float* alpha,
268
+ const cuComplex* a,
269
+ const int* lda,
270
+ const float* beta,
271
+ cuComplex* c,
272
+ const int* ldc);
273
+
274
+ void zherk_(const char* uplo,
275
+ const char* trans,
276
+ const int* n,
277
+ const int* k,
278
+ const double* alpha,
279
+ const cuDoubleComplex* a,
280
+ const int* lda,
281
+ const double* beta,
282
+ cuDoubleComplex* c,
283
+ const int* ldc);
284
+
285
+ void cherk(const char* uplo,
286
+ const char* trans,
287
+ const int* n,
288
+ const int* k,
289
+ const float* alpha,
290
+ const cuComplex* a,
291
+ const int* lda,
292
+ const float* beta,
293
+ cuComplex* c,
294
+ const int* ldc);
295
+
296
+ void zherk(const char* uplo,
297
+ const char* trans,
298
+ const int* n,
299
+ const int* k,
300
+ const double* alpha,
301
+ const cuDoubleComplex* a,
302
+ const int* lda,
303
+ const double* beta,
304
+ cuDoubleComplex* c,
305
+ const int* ldc);
306
+
307
+ /* TRSM */
308
+ void strsm_(const char* side,
309
+ const char* uplo,
310
+ const char* transa,
311
+ const char* diag,
312
+ const int* m,
313
+ const int* n,
314
+ const float* alpha,
315
+ const float* a,
316
+ const int* lda,
317
+ float* b,
318
+ const int* ldb);
319
+
320
+ void dtrsm_(const char* side,
321
+ const char* uplo,
322
+ const char* transa,
323
+ const char* diag,
324
+ const int* m,
325
+ const int* n,
326
+ const double* alpha,
327
+ const double* a,
328
+ const int* lda,
329
+ double* b,
330
+ const int* ldb);
331
+
332
+ void ctrsm_(const char* side,
333
+ const char* uplo,
334
+ const char* transa,
335
+ const char* diag,
336
+ const int* m,
337
+ const int* n,
338
+ const cuComplex* alpha,
339
+ const cuComplex* a,
340
+ const int* lda,
341
+ cuComplex* b,
342
+ const int* ldb);
343
+
344
+ void ztrsm_(const char* side,
345
+ const char* uplo,
346
+ const char* transa,
347
+ const char* diag,
348
+ const int* m,
349
+ const int* n,
350
+ const cuDoubleComplex* alpha,
351
+ const cuDoubleComplex* a,
352
+ const int* lda,
353
+ cuDoubleComplex* b,
354
+ const int* ldb);
355
+
356
+ void strsm(const char* side,
357
+ const char* uplo,
358
+ const char* transa,
359
+ const char* diag,
360
+ const int* m,
361
+ const int* n,
362
+ const float* alpha,
363
+ const float* a,
364
+ const int* lda,
365
+ float* b,
366
+ const int* ldb);
367
+
368
+ void dtrsm(const char* side,
369
+ const char* uplo,
370
+ const char* transa,
371
+ const char* diag,
372
+ const int* m,
373
+ const int* n,
374
+ const double* alpha,
375
+ const double* a,
376
+ const int* lda,
377
+ double* b,
378
+ const int* ldb);
379
+
380
+ void ctrsm(const char* side,
381
+ const char* uplo,
382
+ const char* transa,
383
+ const char* diag,
384
+ const int* m,
385
+ const int* n,
386
+ const cuComplex* alpha,
387
+ const cuComplex* a,
388
+ const int* lda,
389
+ cuComplex* b,
390
+ const int* ldb);
391
+
392
+ void ztrsm(const char* side,
393
+ const char* uplo,
394
+ const char* transa,
395
+ const char* diag,
396
+ const int* m,
397
+ const int* n,
398
+ const cuDoubleComplex* alpha,
399
+ const cuDoubleComplex* a,
400
+ const int* lda,
401
+ cuDoubleComplex* b,
402
+ const int* ldb);
403
+
404
+ /* SYMM */
405
+ void ssymm_(const char* side,
406
+ const char* uplo,
407
+ const int* m,
408
+ const int* n,
409
+ const float* alpha,
410
+ const float* a,
411
+ const int* lda,
412
+ const float* b,
413
+ const int* ldb,
414
+ const float* beta,
415
+ float* c,
416
+ const int* ldc);
417
+
418
+ void dsymm_(const char* side,
419
+ const char* uplo,
420
+ const int* m,
421
+ const int* n,
422
+ const double* alpha,
423
+ const double* a,
424
+ const int* lda,
425
+ const double* b,
426
+ const int* ldb,
427
+ const double* beta,
428
+ double* c,
429
+ const int* ldc);
430
+
431
+ void csymm_(const char* side,
432
+ const char* uplo,
433
+ const int* m,
434
+ const int* n,
435
+ const cuComplex* alpha,
436
+ const cuComplex* a,
437
+ const int* lda,
438
+ const cuComplex* b,
439
+ const int* ldb,
440
+ const cuComplex* beta,
441
+ cuComplex* c,
442
+ const int* ldc);
443
+
444
+ void zsymm_(const char* side,
445
+ const char* uplo,
446
+ const int* m,
447
+ const int* n,
448
+ const cuDoubleComplex* alpha,
449
+ const cuDoubleComplex* a,
450
+ const int* lda,
451
+ const cuDoubleComplex* b,
452
+ const int* ldb,
453
+ const cuDoubleComplex* beta,
454
+ cuDoubleComplex* c,
455
+ const int* ldc);
456
+
457
+ void ssymm(const char* side,
458
+ const char* uplo,
459
+ const int* m,
460
+ const int* n,
461
+ const float* alpha,
462
+ const float* a,
463
+ const int* lda,
464
+ const float* b,
465
+ const int* ldb,
466
+ const float* beta,
467
+ float* c,
468
+ const int* ldc);
469
+
470
+ void dsymm(const char* side,
471
+ const char* uplo,
472
+ const int* m,
473
+ const int* n,
474
+ const double* alpha,
475
+ const double* a,
476
+ const int* lda,
477
+ const double* b,
478
+ const int* ldb,
479
+ const double* beta,
480
+ double* c,
481
+ const int* ldc);
482
+
483
+ void csymm(const char* side,
484
+ const char* uplo,
485
+ const int* m,
486
+ const int* n,
487
+ const cuComplex* alpha,
488
+ const cuComplex* a,
489
+ const int* lda,
490
+ const cuComplex* b,
491
+ const int* ldb,
492
+ const cuComplex* beta,
493
+ cuComplex* c,
494
+ const int* ldc);
495
+
496
+ void zsymm(const char* side,
497
+ const char* uplo,
498
+ const int* m,
499
+ const int* n,
500
+ const cuDoubleComplex* alpha,
501
+ const cuDoubleComplex* a,
502
+ const int* lda,
503
+ const cuDoubleComplex* b,
504
+ const int* ldb,
505
+ const cuDoubleComplex* beta,
506
+ cuDoubleComplex* c,
507
+ const int* ldc);
508
+
509
+ /* HEMM */
510
+ void chemm_(const char* side,
511
+ const char* uplo,
512
+ const int* m,
513
+ const int* n,
514
+ const cuComplex* alpha,
515
+ const cuComplex* a,
516
+ const int* lda,
517
+ const cuComplex* b,
518
+ const int* ldb,
519
+ const cuComplex* beta,
520
+ cuComplex* c,
521
+ const int* ldc);
522
+
523
+ void zhemm_(const char* side,
524
+ const char* uplo,
525
+ const int* m,
526
+ const int* n,
527
+ const cuDoubleComplex* alpha,
528
+ const cuDoubleComplex* a,
529
+ const int* lda,
530
+ const cuDoubleComplex* b,
531
+ const int* ldb,
532
+ const cuDoubleComplex* beta,
533
+ cuDoubleComplex* c,
534
+ const int* ldc);
535
+
536
+ /* HEMM with no underscore*/
537
+ void chemm(const char* side,
538
+ const char* uplo,
539
+ const int* m,
540
+ const int* n,
541
+ const cuComplex* alpha,
542
+ const cuComplex* a,
543
+ const int* lda,
544
+ const cuComplex* b,
545
+ const int* ldb,
546
+ const cuComplex* beta,
547
+ cuComplex* c,
548
+ const int* ldc);
549
+
550
+ void zhemm(const char* side,
551
+ const char* uplo,
552
+ const int* m,
553
+ const int* n,
554
+ const cuDoubleComplex* alpha,
555
+ const cuDoubleComplex* a,
556
+ const int* lda,
557
+ const cuDoubleComplex* b,
558
+ const int* ldb,
559
+ const cuDoubleComplex* beta,
560
+ cuDoubleComplex* c,
561
+ const int* ldc);
562
+
563
+ /* SYR2K */
564
+ void ssyr2k_(const char* uplo,
565
+ const char* trans,
566
+ const int* n,
567
+ const int* k,
568
+ const float* alpha,
569
+ const float* a,
570
+ const int* lda,
571
+ const float* b,
572
+ const int* ldb,
573
+ const float* beta,
574
+ float* c,
575
+ const int* ldc);
576
+
577
+ void dsyr2k_(const char* uplo,
578
+ const char* trans,
579
+ const int* n,
580
+ const int* k,
581
+ const double* alpha,
582
+ const double* a,
583
+ const int* lda,
584
+ const double* b,
585
+ const int* ldb,
586
+ const double* beta,
587
+ double* c,
588
+ const int* ldc);
589
+
590
+ void csyr2k_(const char* uplo,
591
+ const char* trans,
592
+ const int* n,
593
+ const int* k,
594
+ const cuComplex* alpha,
595
+ const cuComplex* a,
596
+ const int* lda,
597
+ const cuComplex* b,
598
+ const int* ldb,
599
+ const cuComplex* beta,
600
+ cuComplex* c,
601
+ const int* ldc);
602
+
603
+ void zsyr2k_(const char* uplo,
604
+ const char* trans,
605
+ const int* n,
606
+ const int* k,
607
+ const cuDoubleComplex* alpha,
608
+ const cuDoubleComplex* a,
609
+ const int* lda,
610
+ const cuDoubleComplex* b,
611
+ const int* ldb,
612
+ const cuDoubleComplex* beta,
613
+ cuDoubleComplex* c,
614
+ const int* ldc);
615
+
616
+ /* SYR2K no_underscore*/
617
+ void ssyr2k(const char* uplo,
618
+ const char* trans,
619
+ const int* n,
620
+ const int* k,
621
+ const float* alpha,
622
+ const float* a,
623
+ const int* lda,
624
+ const float* b,
625
+ const int* ldb,
626
+ const float* beta,
627
+ float* c,
628
+ const int* ldc);
629
+
630
+ void dsyr2k(const char* uplo,
631
+ const char* trans,
632
+ const int* n,
633
+ const int* k,
634
+ const double* alpha,
635
+ const double* a,
636
+ const int* lda,
637
+ const double* b,
638
+ const int* ldb,
639
+ const double* beta,
640
+ double* c,
641
+ const int* ldc);
642
+
643
+ void csyr2k(const char* uplo,
644
+ const char* trans,
645
+ const int* n,
646
+ const int* k,
647
+ const cuComplex* alpha,
648
+ const cuComplex* a,
649
+ const int* lda,
650
+ const cuComplex* b,
651
+ const int* ldb,
652
+ const cuComplex* beta,
653
+ cuComplex* c,
654
+ const int* ldc);
655
+
656
+ void zsyr2k(const char* uplo,
657
+ const char* trans,
658
+ const int* n,
659
+ const int* k,
660
+ const cuDoubleComplex* alpha,
661
+ const cuDoubleComplex* a,
662
+ const int* lda,
663
+ const cuDoubleComplex* b,
664
+ const int* ldb,
665
+ const cuDoubleComplex* beta,
666
+ cuDoubleComplex* c,
667
+ const int* ldc);
668
+
669
+ /* HERK */
670
+ void cher2k_(const char* uplo,
671
+ const char* trans,
672
+ const int* n,
673
+ const int* k,
674
+ const cuComplex* alpha,
675
+ const cuComplex* a,
676
+ const int* lda,
677
+ const cuComplex* b,
678
+ const int* ldb,
679
+ const float* beta,
680
+ cuComplex* c,
681
+ const int* ldc);
682
+
683
+ void zher2k_(const char* uplo,
684
+ const char* trans,
685
+ const int* n,
686
+ const int* k,
687
+ const cuDoubleComplex* alpha,
688
+ const cuDoubleComplex* a,
689
+ const int* lda,
690
+ const cuDoubleComplex* b,
691
+ const int* ldb,
692
+ const double* beta,
693
+ cuDoubleComplex* c,
694
+ const int* ldc);
695
+
696
+ /* HER2K with no underscore */
697
+ void cher2k(const char* uplo,
698
+ const char* trans,
699
+ const int* n,
700
+ const int* k,
701
+ const cuComplex* alpha,
702
+ const cuComplex* a,
703
+ const int* lda,
704
+ const cuComplex* b,
705
+ const int* ldb,
706
+ const float* beta,
707
+ cuComplex* c,
708
+ const int* ldc);
709
+
710
+ void zher2k(const char* uplo,
711
+ const char* trans,
712
+ const int* n,
713
+ const int* k,
714
+ const cuDoubleComplex* alpha,
715
+ const cuDoubleComplex* a,
716
+ const int* lda,
717
+ const cuDoubleComplex* b,
718
+ const int* ldb,
719
+ const double* beta,
720
+ cuDoubleComplex* c,
721
+ const int* ldc);
722
+
723
+ /* TRMM */
724
+ void strmm_(const char* side,
725
+ const char* uplo,
726
+ const char* transa,
727
+ const char* diag,
728
+ const int* m,
729
+ const int* n,
730
+ const float* alpha,
731
+ const float* a,
732
+ const int* lda,
733
+ float* b,
734
+ const int* ldb);
735
+
736
+ void dtrmm_(const char* side,
737
+ const char* uplo,
738
+ const char* transa,
739
+ const char* diag,
740
+ const int* m,
741
+ const int* n,
742
+ const double* alpha,
743
+ const double* a,
744
+ const int* lda,
745
+ double* b,
746
+ const int* ldb);
747
+
748
+ void ctrmm_(const char* side,
749
+ const char* uplo,
750
+ const char* transa,
751
+ const char* diag,
752
+ const int* m,
753
+ const int* n,
754
+ const cuComplex* alpha,
755
+ const cuComplex* a,
756
+ const int* lda,
757
+ cuComplex* b,
758
+ const int* ldb);
759
+
760
+ void ztrmm_(const char* side,
761
+ const char* uplo,
762
+ const char* transa,
763
+ const char* diag,
764
+ const int* m,
765
+ const int* n,
766
+ const cuDoubleComplex* alpha,
767
+ const cuDoubleComplex* a,
768
+ const int* lda,
769
+ cuDoubleComplex* b,
770
+ const int* ldb);
771
+
772
+ void strmm(const char* side,
773
+ const char* uplo,
774
+ const char* transa,
775
+ const char* diag,
776
+ const int* m,
777
+ const int* n,
778
+ const float* alpha,
779
+ const float* a,
780
+ const int* lda,
781
+ float* b,
782
+ const int* ldb);
783
+
784
+ void dtrmm(const char* side,
785
+ const char* uplo,
786
+ const char* transa,
787
+ const char* diag,
788
+ const int* m,
789
+ const int* n,
790
+ const double* alpha,
791
+ const double* a,
792
+ const int* lda,
793
+ double* b,
794
+ const int* ldb);
795
+
796
+ void ctrmm(const char* side,
797
+ const char* uplo,
798
+ const char* transa,
799
+ const char* diag,
800
+ const int* m,
801
+ const int* n,
802
+ const cuComplex* alpha,
803
+ const cuComplex* a,
804
+ const int* lda,
805
+ cuComplex* b,
806
+ const int* ldb);
807
+
808
+ void ztrmm(const char* side,
809
+ const char* uplo,
810
+ const char* transa,
811
+ const char* diag,
812
+ const int* m,
813
+ const int* n,
814
+ const cuDoubleComplex* alpha,
815
+ const cuDoubleComplex* a,
816
+ const int* lda,
817
+ cuDoubleComplex* b,
818
+ const int* ldb);
819
+
820
+ #if defined(__cplusplus)
821
+ }
822
+ #endif /* __cplusplus */
823
+
824
+ #endif /* !defined(NVBLAS_H_) */
.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cublas/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (190 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cublas/lib/libnvblas.so.12 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7c2a58dc54154208392301d0fe3d53a120e4c1ebeab9e80ce91fe9948baeadc9
3
+ size 757496
.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (190 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (198 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/include/nvrtc.h ADDED
@@ -0,0 +1,869 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //
2
+ // NVIDIA_COPYRIGHT_BEGIN
3
+ //
4
+ // Copyright (c) 2014-2023, NVIDIA CORPORATION. All rights reserved.
5
+ //
6
+ // NVIDIA CORPORATION and its licensors retain all intellectual property
7
+ // and proprietary rights in and to this software, related documentation
8
+ // and any modifications thereto. Any use, reproduction, disclosure or
9
+ // distribution of this software and related documentation without an express
10
+ // license agreement from NVIDIA CORPORATION is strictly prohibited.
11
+ //
12
+ // NVIDIA_COPYRIGHT_END
13
+ //
14
+
15
+ #ifndef __NVRTC_H__
16
+ #define __NVRTC_H__
17
+
18
+ #ifdef __cplusplus
19
+ extern "C" {
20
+ #endif /* __cplusplus */
21
+
22
+ #include <stdlib.h>
23
+
24
+
25
+ /*************************************************************************//**
26
+ *
27
+ * \defgroup error Error Handling
28
+ *
29
+ * NVRTC defines the following enumeration type and function for API call
30
+ * error handling.
31
+ *
32
+ ****************************************************************************/
33
+
34
+
35
+ /**
36
+ * \ingroup error
37
+ * \brief The enumerated type nvrtcResult defines API call result codes.
38
+ * NVRTC API functions return nvrtcResult to indicate the call
39
+ * result.
40
+ */
41
+ typedef enum {
42
+ NVRTC_SUCCESS = 0,
43
+ NVRTC_ERROR_OUT_OF_MEMORY = 1,
44
+ NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2,
45
+ NVRTC_ERROR_INVALID_INPUT = 3,
46
+ NVRTC_ERROR_INVALID_PROGRAM = 4,
47
+ NVRTC_ERROR_INVALID_OPTION = 5,
48
+ NVRTC_ERROR_COMPILATION = 6,
49
+ NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7,
50
+ NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8,
51
+ NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9,
52
+ NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10,
53
+ NVRTC_ERROR_INTERNAL_ERROR = 11,
54
+ NVRTC_ERROR_TIME_FILE_WRITE_FAILED = 12
55
+ } nvrtcResult;
56
+
57
+
58
+ /**
59
+ * \ingroup error
60
+ * \brief nvrtcGetErrorString is a helper function that returns a string
61
+ * describing the given nvrtcResult code, e.g., NVRTC_SUCCESS to
62
+ * \c "NVRTC_SUCCESS".
63
+ * For unrecognized enumeration values, it returns
64
+ * \c "NVRTC_ERROR unknown".
65
+ *
66
+ * \param [in] result CUDA Runtime Compilation API result code.
67
+ * \return Message string for the given #nvrtcResult code.
68
+ */
69
+ const char *nvrtcGetErrorString(nvrtcResult result);
70
+
71
+
72
+ /*************************************************************************//**
73
+ *
74
+ * \defgroup query General Information Query
75
+ *
76
+ * NVRTC defines the following function for general information query.
77
+ *
78
+ ****************************************************************************/
79
+
80
+
81
+ /**
82
+ * \ingroup query
83
+ * \brief nvrtcVersion sets the output parameters \p major and \p minor
84
+ * with the CUDA Runtime Compilation version number.
85
+ *
86
+ * \param [out] major CUDA Runtime Compilation major version number.
87
+ * \param [out] minor CUDA Runtime Compilation minor version number.
88
+ * \return
89
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
90
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
91
+ *
92
+ */
93
+ nvrtcResult nvrtcVersion(int *major, int *minor);
94
+
95
+
96
+ /**
97
+ * \ingroup query
98
+ * \brief nvrtcGetNumSupportedArchs sets the output parameter \p numArchs
99
+ * with the number of architectures supported by NVRTC. This can
100
+ * then be used to pass an array to ::nvrtcGetSupportedArchs to
101
+ * get the supported architectures.
102
+ *
103
+ * \param [out] numArchs number of supported architectures.
104
+ * \return
105
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
106
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
107
+ *
108
+ * see ::nvrtcGetSupportedArchs
109
+ */
110
+ nvrtcResult nvrtcGetNumSupportedArchs(int* numArchs);
111
+
112
+
113
+ /**
114
+ * \ingroup query
115
+ * \brief nvrtcGetSupportedArchs populates the array passed via the output parameter
116
+ * \p supportedArchs with the architectures supported by NVRTC. The array is
117
+ * sorted in the ascending order. The size of the array to be passed can be
118
+ * determined using ::nvrtcGetNumSupportedArchs.
119
+ *
120
+ * \param [out] supportedArchs sorted array of supported architectures.
121
+ * \return
122
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
123
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
124
+ *
125
+ * see ::nvrtcGetNumSupportedArchs
126
+ */
127
+ nvrtcResult nvrtcGetSupportedArchs(int* supportedArchs);
128
+
129
+
130
+ /*************************************************************************//**
131
+ *
132
+ * \defgroup compilation Compilation
133
+ *
134
+ * NVRTC defines the following type and functions for actual compilation.
135
+ *
136
+ ****************************************************************************/
137
+
138
+
139
+ /**
140
+ * \ingroup compilation
141
+ * \brief nvrtcProgram is the unit of compilation, and an opaque handle for
142
+ * a program.
143
+ *
144
+ * To compile a CUDA program string, an instance of nvrtcProgram must be
145
+ * created first with ::nvrtcCreateProgram, then compiled with
146
+ * ::nvrtcCompileProgram.
147
+ */
148
+ typedef struct _nvrtcProgram *nvrtcProgram;
149
+
150
+
151
+ /**
152
+ * \ingroup compilation
153
+ * \brief nvrtcCreateProgram creates an instance of nvrtcProgram with the
154
+ * given input parameters, and sets the output parameter \p prog with
155
+ * it.
156
+ *
157
+ * \param [out] prog CUDA Runtime Compilation program.
158
+ * \param [in] src CUDA program source.
159
+ * \param [in] name CUDA program name.\n
160
+ * \p name can be \c NULL; \c "default_program" is
161
+ * used when \p name is \c NULL or "".
162
+ * \param [in] numHeaders Number of headers used.\n
163
+ * \p numHeaders must be greater than or equal to 0.
164
+ * \param [in] headers Sources of the headers.\n
165
+ * \p headers can be \c NULL when \p numHeaders is
166
+ * 0.
167
+ * \param [in] includeNames Name of each header by which they can be
168
+ * included in the CUDA program source.\n
169
+ * \p includeNames can be \c NULL when \p numHeaders
170
+ * is 0. These headers must be included with the exact
171
+ * names specified here.
172
+ * \return
173
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
174
+ * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
175
+ * - \link #nvrtcResult NVRTC_ERROR_PROGRAM_CREATION_FAILURE \endlink
176
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
177
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
178
+ *
179
+ * \see ::nvrtcDestroyProgram
180
+ */
181
+ nvrtcResult nvrtcCreateProgram(nvrtcProgram *prog,
182
+ const char *src,
183
+ const char *name,
184
+ int numHeaders,
185
+ const char * const *headers,
186
+ const char * const *includeNames);
187
+
188
+
189
+ /**
190
+ * \ingroup compilation
191
+ * \brief nvrtcDestroyProgram destroys the given program.
192
+ *
193
+ * \param [in] prog CUDA Runtime Compilation program.
194
+ * \return
195
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
196
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
197
+ *
198
+ * \see ::nvrtcCreateProgram
199
+ */
200
+ nvrtcResult nvrtcDestroyProgram(nvrtcProgram *prog);
201
+
202
+
203
+ /**
204
+ * \ingroup compilation
205
+ * \brief nvrtcCompileProgram compiles the given program.
206
+ *
207
+ * \param [in] prog CUDA Runtime Compilation program.
208
+ * \param [in] numOptions Number of compiler options passed.
209
+ * \param [in] options Compiler options in the form of C string array.\n
210
+ * \p options can be \c NULL when \p numOptions is 0.
211
+ *
212
+ * \return
213
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
214
+ * - \link #nvrtcResult NVRTC_ERROR_OUT_OF_MEMORY \endlink
215
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
216
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
217
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_OPTION \endlink
218
+ * - \link #nvrtcResult NVRTC_ERROR_COMPILATION \endlink
219
+ * - \link #nvrtcResult NVRTC_ERROR_BUILTIN_OPERATION_FAILURE \endlink
220
+ * - \link #nvrtcResult NVRTC_ERROR_TIME_FILE_WRITE_FAILED \endlink
221
+ *
222
+ * It supports compile options listed in \ref options.
223
+ */
224
+ nvrtcResult nvrtcCompileProgram(nvrtcProgram prog,
225
+ int numOptions, const char * const *options);
226
+
227
+
228
+ /**
229
+ * \ingroup compilation
230
+ * \brief nvrtcGetPTXSize sets the value of \p ptxSizeRet with the size of the PTX
231
+ * generated by the previous compilation of \p prog (including the
232
+ * trailing \c NULL).
233
+ *
234
+ * \param [in] prog CUDA Runtime Compilation program.
235
+ * \param [out] ptxSizeRet Size of the generated PTX (including the trailing
236
+ * \c NULL).
237
+ * \return
238
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
239
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
240
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
241
+ *
242
+ * \see ::nvrtcGetPTX
243
+ */
244
+ nvrtcResult nvrtcGetPTXSize(nvrtcProgram prog, size_t *ptxSizeRet);
245
+
246
+
247
+ /**
248
+ * \ingroup compilation
249
+ * \brief nvrtcGetPTX stores the PTX generated by the previous compilation
250
+ * of \p prog in the memory pointed by \p ptx.
251
+ *
252
+ * \param [in] prog CUDA Runtime Compilation program.
253
+ * \param [out] ptx Compiled result.
254
+ * \return
255
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
256
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
257
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
258
+ *
259
+ * \see ::nvrtcGetPTXSize
260
+ */
261
+ nvrtcResult nvrtcGetPTX(nvrtcProgram prog, char *ptx);
262
+
263
+
264
+ /**
265
+ * \ingroup compilation
266
+ * \brief nvrtcGetCUBINSize sets the value of \p cubinSizeRet with the size of the cubin
267
+ * generated by the previous compilation of \p prog. The value of
268
+ * cubinSizeRet is set to 0 if the value specified to \c -arch is a
269
+ * virtual architecture instead of an actual architecture.
270
+ *
271
+ * \param [in] prog CUDA Runtime Compilation program.
272
+ * \param [out] cubinSizeRet Size of the generated cubin.
273
+ * \return
274
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
275
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
276
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
277
+ *
278
+ * \see ::nvrtcGetCUBIN
279
+ */
280
+ nvrtcResult nvrtcGetCUBINSize(nvrtcProgram prog, size_t *cubinSizeRet);
281
+
282
+
283
+ /**
284
+ * \ingroup compilation
285
+ * \brief nvrtcGetCUBIN stores the cubin generated by the previous compilation
286
+ * of \p prog in the memory pointed by \p cubin. No cubin is available
287
+ * if the value specified to \c -arch is a virtual architecture instead
288
+ * of an actual architecture.
289
+ *
290
+ * \param [in] prog CUDA Runtime Compilation program.
291
+ * \param [out] cubin Compiled and assembled result.
292
+ * \return
293
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
294
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
295
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
296
+ *
297
+ * \see ::nvrtcGetCUBINSize
298
+ */
299
+ nvrtcResult nvrtcGetCUBIN(nvrtcProgram prog, char *cubin);
300
+
301
+
302
+ #if defined(_WIN32)
303
+ # define __DEPRECATED__(msg) __declspec(deprecated(msg))
304
+ #elif (defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 5 && !defined(__clang__))))
305
+ # define __DEPRECATED__(msg) __attribute__((deprecated))
306
+ #elif (defined(__GNUC__))
307
+ # define __DEPRECATED__(msg) __attribute__((deprecated(msg)))
308
+ #else
309
+ # define __DEPRECATED__(msg)
310
+ #endif
311
+
312
+ /**
313
+ * \ingroup compilation
314
+ * \brief
315
+ * DEPRECATION NOTICE: This function will be removed in a future release. Please use
316
+ * nvrtcGetLTOIRSize (and nvrtcGetLTOIR) instead.
317
+ */
318
+ __DEPRECATED__("This function will be removed in a future release. Please use nvrtcGetLTOIRSize instead")
319
+ nvrtcResult nvrtcGetNVVMSize(nvrtcProgram prog, size_t *nvvmSizeRet);
320
+
321
+ /**
322
+ * \ingroup compilation
323
+ * \brief
324
+ * DEPRECATION NOTICE: This function will be removed in a future release. Please use
325
+ * nvrtcGetLTOIR (and nvrtcGetLTOIRSize) instead.
326
+ */
327
+ __DEPRECATED__("This function will be removed in a future release. Please use nvrtcGetLTOIR instead")
328
+ nvrtcResult nvrtcGetNVVM(nvrtcProgram prog, char *nvvm);
329
+
330
+ #undef __DEPRECATED__
331
+
332
+ /**
333
+ * \ingroup compilation
334
+ * \brief nvrtcGetLTOIRSize sets the value of \p LTOIRSizeRet with the size of the LTO IR
335
+ * generated by the previous compilation of \p prog. The value of
336
+ * LTOIRSizeRet is set to 0 if the program was not compiled with
337
+ * \c -dlto.
338
+ *
339
+ * \param [in] prog CUDA Runtime Compilation program.
340
+ * \param [out] LTOIRSizeRet Size of the generated LTO IR.
341
+ * \return
342
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
343
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
344
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
345
+ *
346
+ * \see ::nvrtcGetLTOIR
347
+ */
348
+ nvrtcResult nvrtcGetLTOIRSize(nvrtcProgram prog, size_t *LTOIRSizeRet);
349
+
350
+
351
+ /**
352
+ * \ingroup compilation
353
+ * \brief nvrtcGetLTOIR stores the LTO IR generated by the previous compilation
354
+ * of \p prog in the memory pointed by \p LTOIR. No LTO IR is available
355
+ * if the program was compiled without \c -dlto.
356
+ *
357
+ * \param [in] prog CUDA Runtime Compilation program.
358
+ * \param [out] LTOIR Compiled result.
359
+ * \return
360
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
361
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
362
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
363
+ *
364
+ * \see ::nvrtcGetLTOIRSize
365
+ */
366
+ nvrtcResult nvrtcGetLTOIR(nvrtcProgram prog, char *LTOIR);
367
+
368
+
369
+ /**
370
+ * \ingroup compilation
371
+ * \brief nvrtcGetOptiXIRSize sets the value of \p optixirSizeRet with the size of the OptiX IR
372
+ * generated by the previous compilation of \p prog. The value of
373
+ * nvrtcGetOptiXIRSize is set to 0 if the program was compiled with
374
+ * options incompatible with OptiX IR generation.
375
+ *
376
+ * \param [in] prog CUDA Runtime Compilation program.
377
+ * \param [out] optixirSizeRet Size of the generated LTO IR.
378
+ * \return
379
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
380
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
381
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
382
+ *
383
+ * \see ::nvrtcGetOptiXIR
384
+ */
385
+ nvrtcResult nvrtcGetOptiXIRSize(nvrtcProgram prog, size_t *optixirSizeRet);
386
+
387
+
388
+ /**
389
+ * \ingroup compilation
390
+ * \brief nvrtcGetOptiXIR stores the OptiX IR generated by the previous compilation
391
+ * of \p prog in the memory pointed by \p optixir. No OptiX IR is available
392
+ * if the program was compiled with options incompatible with OptiX IR generation.
393
+ *
394
+ * \param [in] prog CUDA Runtime Compilation program.
395
+ * \param [out] Optix IR Compiled result.
396
+ * \return
397
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
398
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
399
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
400
+ *
401
+ * \see ::nvrtcGetOptiXIRSize
402
+ */
403
+ nvrtcResult nvrtcGetOptiXIR(nvrtcProgram prog, char *optixir);
404
+
405
+ /**
406
+ * \ingroup compilation
407
+ * \brief nvrtcGetProgramLogSize sets \p logSizeRet with the size of the
408
+ * log generated by the previous compilation of \p prog (including the
409
+ * trailing \c NULL).
410
+ *
411
+ * Note that compilation log may be generated with warnings and informative
412
+ * messages, even when the compilation of \p prog succeeds.
413
+ *
414
+ * \param [in] prog CUDA Runtime Compilation program.
415
+ * \param [out] logSizeRet Size of the compilation log
416
+ * (including the trailing \c NULL).
417
+ * \return
418
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
419
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
420
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
421
+ *
422
+ * \see ::nvrtcGetProgramLog
423
+ */
424
+ nvrtcResult nvrtcGetProgramLogSize(nvrtcProgram prog, size_t *logSizeRet);
425
+
426
+
427
+ /**
428
+ * \ingroup compilation
429
+ * \brief nvrtcGetProgramLog stores the log generated by the previous
430
+ * compilation of \p prog in the memory pointed by \p log.
431
+ *
432
+ * \param [in] prog CUDA Runtime Compilation program.
433
+ * \param [out] log Compilation log.
434
+ * \return
435
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
436
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
437
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
438
+ *
439
+ * \see ::nvrtcGetProgramLogSize
440
+ */
441
+ nvrtcResult nvrtcGetProgramLog(nvrtcProgram prog, char *log);
442
+
443
+
444
+ /**
445
+ * \ingroup compilation
446
+ * \brief nvrtcAddNameExpression notes the given name expression
447
+ * denoting the address of a __global__ function
448
+ * or __device__/__constant__ variable.
449
+ *
450
+ * The identical name expression string must be provided on a subsequent
451
+ * call to nvrtcGetLoweredName to extract the lowered name.
452
+ * \param [in] prog CUDA Runtime Compilation program.
453
+ * \param [in] name_expression constant expression denoting the address of
454
+ * a __global__ function or __device__/__constant__ variable.
455
+ * \return
456
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
457
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
458
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
459
+ * - \link #nvrtcResult NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION \endlink
460
+ *
461
+ * \see ::nvrtcGetLoweredName
462
+ */
463
+ nvrtcResult nvrtcAddNameExpression(nvrtcProgram prog,
464
+ const char * const name_expression);
465
+
466
+ /**
467
+ * \ingroup compilation
468
+ * \brief nvrtcGetLoweredName extracts the lowered (mangled) name
469
+ * for a __global__ function or __device__/__constant__ variable,
470
+ * and updates *lowered_name to point to it. The memory containing
471
+ * the name is released when the NVRTC program is destroyed by
472
+ * nvrtcDestroyProgram.
473
+ * The identical name expression must have been previously
474
+ * provided to nvrtcAddNameExpression.
475
+ *
476
+ * \param [in] prog CUDA Runtime Compilation program.
477
+ * \param [in] name_expression constant expression denoting the address of
478
+ * a __global__ function or __device__/__constant__ variable.
479
+ * \param [out] lowered_name initialized by the function to point to a
480
+ * C string containing the lowered (mangled)
481
+ * name corresponding to the provided name expression.
482
+ * \return
483
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
484
+ * - \link #nvrtcResult NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION \endlink
485
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_PROGRAM \endlink
486
+ * - \link #nvrtcResult NVRTC_ERROR_INVALID_INPUT \endlink
487
+ * - \link #nvrtcResult NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID \endlink
488
+ *
489
+ * \see ::nvrtcAddNameExpression
490
+ */
491
+ nvrtcResult nvrtcGetLoweredName(nvrtcProgram prog,
492
+ const char *const name_expression,
493
+ const char** lowered_name);
494
+
495
+
496
+ /**
497
+ * \defgroup options Supported Compile Options
498
+ *
499
+ * NVRTC supports the compile options below.
500
+ * Option names with two preceding dashs (\c --) are long option names and
501
+ * option names with one preceding dash (\c -) are short option names.
502
+ * Short option names can be used instead of long option names.
503
+ * When a compile option takes an argument, an assignment operator (\c =)
504
+ * is used to separate the compile option argument from the compile option
505
+ * name, e.g., \c "--gpu-architecture=compute_60".
506
+ * Alternatively, the compile option name and the argument can be specified in
507
+ * separate strings without an assignment operator, .e.g,
508
+ * \c "--gpu-architecture" \c "compute_60".
509
+ * Single-character short option names, such as \c -D, \c -U, and \c -I, do
510
+ * not require an assignment operator, and the compile option name and the
511
+ * argument can be present in the same string with or without spaces between
512
+ * them.
513
+ * For instance, \c "-D=<def>", \c "-D<def>", and \c "-D <def>" are all
514
+ * supported.
515
+ *
516
+ * The valid compiler options are:
517
+ *
518
+ * - Compilation targets
519
+ * - \c --gpu-architecture=\<arch\> (\c -arch)\n
520
+ * Specify the name of the class of GPU architectures for which the
521
+ * input must be compiled.\n
522
+ * - Valid <c>\<arch\></c>s:
523
+ * - \c compute_50
524
+ * - \c compute_52
525
+ * - \c compute_53
526
+ * - \c compute_60
527
+ * - \c compute_61
528
+ * - \c compute_62
529
+ * - \c compute_70
530
+ * - \c compute_72
531
+ * - \c compute_75
532
+ * - \c compute_80
533
+ * - \c compute_87
534
+ * - \c compute_89
535
+ * - \c compute_90
536
+ * - \c compute_90a
537
+ * - \c sm_50
538
+ * - \c sm_52
539
+ * - \c sm_53
540
+ * - \c sm_60
541
+ * - \c sm_61
542
+ * - \c sm_62
543
+ * - \c sm_70
544
+ * - \c sm_72
545
+ * - \c sm_75
546
+ * - \c sm_80
547
+ * - \c sm_87
548
+ * - \c sm_89
549
+ * - \c sm_90
550
+ * - \c sm_90a
551
+ * - Default: \c compute_52
552
+ * - Separate compilation / whole-program compilation
553
+ * - \c --device-c (\c -dc)\n
554
+ * Generate relocatable code that can be linked with other relocatable
555
+ * device code. It is equivalent to --relocatable-device-code=true.
556
+ * - \c --device-w (\c -dw)\n
557
+ * Generate non-relocatable code. It is equivalent to
558
+ * \c --relocatable-device-code=false.
559
+ * - \c --relocatable-device-code={true|false} (\c -rdc)\n
560
+ * Enable (disable) the generation of relocatable device code.
561
+ * - Default: \c false
562
+ * - \c --extensible-whole-program (\c -ewp)\n
563
+ * Do extensible whole program compilation of device code.
564
+ * - Default: \c false
565
+ * - Debugging support
566
+ * - \c --device-debug (\c -G)\n
567
+ * Generate debug information. If --dopt is not specified,
568
+ * then turns off all optimizations.
569
+ * - \c --generate-line-info (\c -lineinfo)\n
570
+ * Generate line-number information.
571
+ * - Code generation
572
+ * - \c --dopt on (\c -dopt)\n
573
+ * - \c --dopt=on \n
574
+ * Enable device code optimization. When specified along with '-G', enables
575
+ * limited debug information generation for optimized device code (currently,
576
+ * only line number information).
577
+ * When '-G' is not specified, '-dopt=on' is implicit.
578
+ * - \c --ptxas-options \<options\> (\c -Xptxas)\n
579
+ * - \c --ptxas-options=\<options\> \n
580
+ * Specify options directly to ptxas, the PTX optimizing assembler.
581
+ * - \c --maxrregcount=\<N\> (\c -maxrregcount)\n
582
+ * Specify the maximum amount of registers that GPU functions can use.
583
+ * Until a function-specific limit, a higher value will generally
584
+ * increase the performance of individual GPU threads that execute this
585
+ * function. However, because thread registers are allocated from a
586
+ * global register pool on each GPU, a higher value of this option will
587
+ * also reduce the maximum thread block size, thereby reducing the amount
588
+ * of thread parallelism. Hence, a good maxrregcount value is the result
589
+ * of a trade-off. If this option is not specified, then no maximum is
590
+ * assumed. Value less than the minimum registers required by ABI will
591
+ * be bumped up by the compiler to ABI minimum limit.
592
+ * - \c --ftz={true|false} (\c -ftz)\n
593
+ * When performing single-precision floating-point operations, flush
594
+ * denormal values to zero or preserve denormal values.
595
+ * \c --use_fast_math implies \c --ftz=true.
596
+ * - Default: \c false
597
+ * - \c --prec-sqrt={true|false} (\c -prec-sqrt)\n
598
+ * For single-precision floating-point square root, use IEEE
599
+ * round-to-nearest mode or use a faster approximation.
600
+ * \c --use_fast_math implies \c --prec-sqrt=false.
601
+ * - Default: \c true
602
+ * - \c --prec-div={true|false} (\c -prec-div)\n
603
+ * For single-precision floating-point division and reciprocals, use IEEE
604
+ * round-to-nearest mode or use a faster approximation.
605
+ * \c --use_fast_math implies \c --prec-div=false.
606
+ * - Default: \c true
607
+ * - \c --fmad={true|false} (\c -fmad)\n
608
+ * Enables (disables) the contraction of floating-point multiplies and
609
+ * adds/subtracts into floating-point multiply-add operations (FMAD,
610
+ * FFMA, or DFMA). \c --use_fast_math implies \c --fmad=true.
611
+ * - Default: \c true
612
+ * - \c --use_fast_math (\c -use_fast_math)\n
613
+ * Make use of fast math operations.
614
+ * \c --use_fast_math implies \c --ftz=true \c --prec-div=false
615
+ * \c --prec-sqrt=false \c --fmad=true.
616
+ * - \c --extra-device-vectorization (\c -extra-device-vectorization)\n
617
+ * Enables more aggressive device code vectorization in the NVVM optimizer.
618
+ * - \c --modify-stack-limit={true|false} (\c -modify-stack-limit)\n
619
+ * On Linux, during compilation, use \c setrlimit() to increase stack size
620
+ * to maximum allowed. The limit is reset to the previous value at the
621
+ * end of compilation.
622
+ * Note: \c setrlimit() changes the value for the entire process.
623
+ * - Default: \c true
624
+ * - \c --dlink-time-opt (\c -dlto)\n
625
+ * Generate intermediate code for later link-time optimization.
626
+ * It implies \c -rdc=true.
627
+ * Note: when this option is used the nvrtcGetLTOIR API should be used,
628
+ * as PTX or Cubin will not be generated.
629
+ * - \c --gen-opt-lto (\c -gen-opt-lto)\n
630
+ * Run the optimizer passes before generating the LTO IR.
631
+ * - \c --optix-ir (\c -optix-ir)\n
632
+ * Generate OptiX IR. The Optix IR is only intended for consumption by OptiX
633
+ * through appropriate APIs. This feature is not supported with
634
+ * link-time-optimization (\c -dlto)\n.
635
+ * Note: when this option is used the nvrtcGetOptiX API should be used,
636
+ * as PTX or Cubin will not be generated.
637
+ * - \c --jump-table-density=[0-101] (\c -jtd)\n
638
+ * Specify the case density percentage in switch statements, and use it as
639
+ * a minimal threshold to determine whether jump table(brx.idx instruction)
640
+ * will be used to implement a switch statement. Default value is 101. The
641
+ * percentage ranges from 0 to 101 inclusively.
642
+ * - Preprocessing
643
+ * - \c --define-macro=\<def\> (\c -D)\n
644
+ * \c \<def\> can be either \c \<name\> or \c \<name=definitions\>.
645
+ * - \c \<name\> \n
646
+ * Predefine \c \<name\> as a macro with definition \c 1.
647
+ * - \c \<name\>=\<definition\> \n
648
+ * The contents of \c \<definition\> are tokenized and preprocessed
649
+ * as if they appeared during translation phase three in a \c \#define
650
+ * directive. In particular, the definition will be truncated by
651
+ * embedded new line characters.
652
+ * - \c --undefine-macro=\<def\> (\c -U)\n
653
+ * Cancel any previous definition of \c \<def\>.
654
+ * - \c --include-path=\<dir\> (\c -I)\n
655
+ * Add the directory \c \<dir\> to the list of directories to be
656
+ * searched for headers. These paths are searched after the list of
657
+ * headers given to ::nvrtcCreateProgram.
658
+ * - \c --pre-include=\<header\> (\c -include)\n
659
+ * Preinclude \c \<header\> during preprocessing.
660
+ * - \c --no-source-include (\c -no-source-include)
661
+ * The preprocessor by default adds the directory of each input sources
662
+ * to the include path. This option disables this feature and only
663
+ * considers the path specified explicitly.
664
+ * - Language Dialect
665
+ * - \c --std={c++03|c++11|c++14|c++17|c++20}
666
+ * (\c -std={c++11|c++14|c++17|c++20})\n
667
+ * Set language dialect to C++03, C++11, C++14, C++17 or C++20
668
+ * - Default: \c c++17
669
+ * - \c --builtin-move-forward={true|false} (\c -builtin-move-forward)\n
670
+ * Provide builtin definitions of \c std::move and \c std::forward,
671
+ * when C++11 or later language dialect is selected.
672
+ * - Default: \c true
673
+ * - \c --builtin-initializer-list={true|false}
674
+ * (\c -builtin-initializer-list)\n
675
+ * Provide builtin definitions of \c std::initializer_list class and
676
+ * member functions when C++11 or later language dialect is selected.
677
+ * - Default: \c true
678
+ * - Misc.
679
+ * - \c --disable-warnings (\c -w)\n
680
+ * Inhibit all warning messages.
681
+ * - \c --restrict (\c -restrict)\n
682
+ * Programmer assertion that all kernel pointer parameters are restrict
683
+ * pointers.
684
+ * - \c --device-as-default-execution-space
685
+ * (\c -default-device)\n
686
+ * Treat entities with no execution space annotation as \c __device__
687
+ * entities.
688
+ * - \c --device-int128 (\c -device-int128)\n
689
+ * Allow the \c __int128 type in device code. Also causes the macro \c __CUDACC_RTC_INT128__
690
+ * to be defined.
691
+ * - \c --optimization-info=\<kind\> (\c -opt-info)\n
692
+ * Provide optimization reports for the specified kind of optimization.
693
+ * The following kind tags are supported:
694
+ * - \c inline : emit a remark when a function is inlined.
695
+ * - \c --display-error-number (\c -err-no)\n
696
+ * Display diagnostic number for warning messages. (Default)
697
+ * - \c --no-display-error-number (\c -no-err-no)\n
698
+ * Disables the display of a diagnostic number for warning messages.
699
+ * - \c --diag-error=<error-number>,... (\c -diag-error)\n
700
+ * Emit error for specified diagnostic message number(s). Message numbers can be separated by comma.
701
+ * - \c --diag-suppress=<error-number>,... (\c -diag-suppress)\n
702
+ * Suppress specified diagnostic message number(s). Message numbers can be separated by comma.
703
+ * - \c --diag-warn=<error-number>,... (\c -diag-warn)\n
704
+ * Emit warning for specified diagnostic message number(s). Message numbers can be separated by comma.
705
+ * - \c --brief-diagnostics={true|false} (\c -brief-diag)\n
706
+ * This option disables or enables showing source line and column info
707
+ * in a diagnostic.
708
+ * The --brief-diagnostics=true will not show the source line and column info.
709
+ * - Default: \c false
710
+ * - \c --time=<file-name> (\c -time)\n
711
+ * Generate a comma separated value table with the time taken by each compilation
712
+ * phase, and append it at the end of the file given as the option argument.
713
+ * If the file does not exist, the column headings are generated in the first row
714
+ * of the table. If the file name is '-', the timing data is written to the compilation log.
715
+ * - \c --split-compile=<number of threads> (\c -split-compile=<number of threads>)\n
716
+ * Perform compiler optimizations in parallel.
717
+ * Split compilation attempts to reduce compile time by enabling the compiler to run certain
718
+ * optimization passes concurrently. This option accepts a numerical value that specifies the
719
+ * maximum number of threads the compiler can use. One can also allow the compiler to use the maximum
720
+ * threads available on the system by setting --split-compile=0.
721
+ * Setting --split-compile=1 will cause this option to be ignored.
722
+ * - \c --fdevice-syntax-only (\c -fdevice-syntax-only)\n
723
+ * Ends device compilation after front-end syntax checking. This option does not generate valid
724
+ * device code.
725
+ * - \c --minimal (\c -minimal)\n
726
+ * Omit certain language features to reduce compile time for small programs.
727
+ * In particular, the following are omitted:
728
+ * - Texture and surface functions and associated types, e.g., \c cudaTextureObject_t.
729
+ * - CUDA Runtime Functions that are provided by the cudadevrt device code library,
730
+ * typically named with prefix "cuda", e.g., \c cudaMalloc.
731
+ * - Kernel launch from device code.
732
+ * - Types and macros associated with CUDA Runtime and Driver APIs,
733
+ * provided by cuda/tools/cudart/driver_types.h, typically named with prefix "cuda", e.g., \c cudaError_t.
734
+ *
735
+ */
736
+
737
+ #ifdef __cplusplus
738
+ }
739
+ #endif /* __cplusplus */
740
+
741
+
742
+ /* The utility function 'nvrtcGetTypeName' is not available by default. Define
743
+ the macro 'NVRTC_GET_TYPE_NAME' to a non-zero value to make it available.
744
+ */
745
+
746
+ #if NVRTC_GET_TYPE_NAME || __DOXYGEN_ONLY__
747
+
748
+ #if NVRTC_USE_CXXABI || __clang__ || __GNUC__ || __DOXYGEN_ONLY__
749
+ #include <cxxabi.h>
750
+ #include <cstdlib>
751
+
752
+ #elif defined(_WIN32)
753
+ #include <Windows.h>
754
+ #include <DbgHelp.h>
755
+ #endif /* NVRTC_USE_CXXABI || __clang__ || __GNUC__ */
756
+
757
+
758
+ #include <string>
759
+ #include <typeinfo>
760
+
761
+ template <typename T> struct __nvrtcGetTypeName_helper_t { };
762
+
763
+ /*************************************************************************//**
764
+ *
765
+ * \defgroup hosthelper Host Helper
766
+ *
767
+ * NVRTC defines the following functions for easier interaction with host code.
768
+ *
769
+ ****************************************************************************/
770
+
771
+ /**
772
+ * \ingroup hosthelper
773
+ * \brief nvrtcGetTypeName stores the source level name of a type in the given
774
+ * std::string location.
775
+ *
776
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
777
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
778
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
779
+ * respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
780
+ * otherwise *result is initialized with the extracted name.
781
+ *
782
+ * Windows-specific notes:
783
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
784
+ * which is not multi-thread safe.
785
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
786
+ *
787
+ * \param [in] tinfo: reference to object of type std::type_info for a given type.
788
+ * \param [in] result: pointer to std::string in which to store the type name.
789
+ * \return
790
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
791
+ * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
792
+ *
793
+ */
794
+ inline nvrtcResult nvrtcGetTypeName(const std::type_info &tinfo, std::string *result)
795
+ {
796
+ #if USE_CXXABI || __clang__ || __GNUC__
797
+ const char *name = tinfo.name();
798
+ int status;
799
+ char *undecorated_name = abi::__cxa_demangle(name, 0, 0, &status);
800
+ if (status == 0) {
801
+ *result = undecorated_name;
802
+ free(undecorated_name);
803
+ return NVRTC_SUCCESS;
804
+ }
805
+ #elif defined(_WIN32)
806
+ const char *name = tinfo.raw_name();
807
+ if (!name || *name != '.') {
808
+ return NVRTC_ERROR_INTERNAL_ERROR;
809
+ }
810
+ char undecorated_name[4096];
811
+ //name+1 skips over the '.' prefix
812
+ if(UnDecorateSymbolName(name+1, undecorated_name,
813
+ sizeof(undecorated_name) / sizeof(*undecorated_name),
814
+ //note: doesn't seem to work correctly without UNDNAME_NO_ARGUMENTS.
815
+ UNDNAME_NO_ARGUMENTS | UNDNAME_NAME_ONLY ) ) {
816
+ *result = undecorated_name;
817
+ return NVRTC_SUCCESS;
818
+ }
819
+ #endif /* USE_CXXABI || __clang__ || __GNUC__ */
820
+
821
+ return NVRTC_ERROR_INTERNAL_ERROR;
822
+ }
823
+
824
+ /**
825
+ * \ingroup hosthelper
826
+ * \brief nvrtcGetTypeName stores the source level name of the template type argument
827
+ * T in the given std::string location.
828
+ *
829
+ * This function is only provided when the macro NVRTC_GET_TYPE_NAME is
830
+ * defined with a non-zero value. It uses abi::__cxa_demangle or UnDecorateSymbolName
831
+ * function calls to extract the type name, when using gcc/clang or cl.exe compilers,
832
+ * respectively. If the name extraction fails, it will return NVRTC_INTERNAL_ERROR,
833
+ * otherwise *result is initialized with the extracted name.
834
+ *
835
+ * Windows-specific notes:
836
+ * - nvrtcGetTypeName() is not multi-thread safe because it calls UnDecorateSymbolName(),
837
+ * which is not multi-thread safe.
838
+ * - The returned string may contain Microsoft-specific keywords such as __ptr64 and __cdecl.
839
+ *
840
+ * \param [in] result: pointer to std::string in which to store the type name.
841
+ * \return
842
+ * - \link #nvrtcResult NVRTC_SUCCESS \endlink
843
+ * - \link #nvrtcResult NVRTC_ERROR_INTERNAL_ERROR \endlink
844
+ *
845
+ */
846
+
847
+ template <typename T>
848
+ nvrtcResult nvrtcGetTypeName(std::string *result)
849
+ {
850
+ nvrtcResult res = nvrtcGetTypeName(typeid(__nvrtcGetTypeName_helper_t<T>),
851
+ result);
852
+ if (res != NVRTC_SUCCESS)
853
+ return res;
854
+
855
+ std::string repr = *result;
856
+ std::size_t idx = repr.find("__nvrtcGetTypeName_helper_t");
857
+ idx = (idx != std::string::npos) ? repr.find("<", idx) : idx;
858
+ std::size_t last_idx = repr.find_last_of('>');
859
+ if (idx == std::string::npos || last_idx == std::string::npos) {
860
+ return NVRTC_ERROR_INTERNAL_ERROR;
861
+ }
862
+ ++idx;
863
+ *result = repr.substr(idx, last_idx - idx);
864
+ return NVRTC_SUCCESS;
865
+ }
866
+
867
+ #endif /* NVRTC_GET_TYPE_NAME */
868
+
869
+ #endif /* __NVRTC_H__ */
.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cuda_nvrtc/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (194 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (192 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/async.h ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_ASYNC_H
50
+ #define _CG_ASYNC_H
51
+
52
+ #include "helpers.h"
53
+ #include "info.h"
54
+
55
+ #include <cuda_pipeline.h>
56
+
57
+ _CG_BEGIN_NAMESPACE
58
+
59
+ namespace details {
60
+ // Groups supported by memcpy_async
61
+ template <class TyGroup>
62
+ struct _async_copy_group_supported : public _CG_STL_NAMESPACE::false_type {};
63
+
64
+ template <unsigned int Sz, typename TyPar>
65
+ struct _async_copy_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>>
66
+ : public _CG_STL_NAMESPACE::true_type {};
67
+ template <>
68
+ struct _async_copy_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
69
+ template <>
70
+ struct _async_copy_group_supported<cooperative_groups::thread_block> : public _CG_STL_NAMESPACE::true_type {};
71
+
72
+ template <class TyGroup>
73
+ using async_copy_group_supported = _async_copy_group_supported<details::remove_qual<TyGroup>>;
74
+
75
+ // Groups that require optimization
76
+ template <class TyGroup>
77
+ struct _async_copy_optimize_tile : public _CG_STL_NAMESPACE::false_type {};
78
+
79
+ template <typename TyPar>
80
+ struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<1, TyPar>>
81
+ : public _CG_STL_NAMESPACE::false_type {};
82
+
83
+ template <unsigned int Sz, typename TyPar>
84
+ struct _async_copy_optimize_tile<cooperative_groups::thread_block_tile<Sz, TyPar>>
85
+ : public _CG_STL_NAMESPACE::true_type {};
86
+
87
+ template <class TyGroup>
88
+ using async_copy_optimize_tile = _async_copy_optimize_tile<details::remove_qual<TyGroup>>;
89
+
90
+ // SFINAE helpers for tile optimizations
91
+ template <class TyGroup>
92
+ using enable_tile_optimization =
93
+ typename _CG_STL_NAMESPACE::enable_if<async_copy_optimize_tile<TyGroup>::value, void *>::type;
94
+
95
+ template <class TyGroup>
96
+ using disable_tile_optimization =
97
+ typename _CG_STL_NAMESPACE::enable_if<!async_copy_optimize_tile<TyGroup>::value, void *>::type;
98
+
99
+ // Segment for punning to aligned types
100
+ template <unsigned int N>
101
+ struct _Segment {
102
+ int _seg[N];
103
+ };
104
+
105
+ // Trivial layout guaranteed-aligned copy-async compatible segments
106
+ template <unsigned int N>
107
+ struct Segment;
108
+ template <>
109
+ struct __align__(4) Segment<1> : public _Segment<1>{};
110
+ template <>
111
+ struct __align__(8) Segment<2> : public _Segment<2>{};
112
+ template <>
113
+ struct __align__(16) Segment<4> : public _Segment<4>{};
114
+
115
+ // Interleaved element by element copies from source to dest
116
+ template <typename TyGroup, typename TyElem>
117
+ _CG_STATIC_QUALIFIER void inline_copy(TyGroup &group, TyElem *__restrict__ dst, const TyElem *__restrict__ src,
118
+ size_t count) {
119
+ const unsigned int rank = group.thread_rank();
120
+ const unsigned int stride = group.size();
121
+
122
+ for (size_t idx = rank; idx < count; idx += stride) {
123
+ dst[idx] = src[idx];
124
+ }
125
+ }
126
+
127
+ template <typename TyGroup, typename TyElem, enable_tile_optimization<TyGroup> = nullptr>
128
+ _CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
129
+ const TyElem *__restrict__ src, size_t count) {
130
+ static_assert(async_copy_group_supported<TyGroup>::value,
131
+ "Async copy is only supported for groups that represent private shared memory");
132
+
133
+ if (count == 0) {
134
+ return;
135
+ }
136
+
137
+ const bool dstIsNotShared = !__isShared(dst);
138
+ const bool srcIsNotGlobal = !__isGlobal(src);
139
+
140
+ if (dstIsNotShared || srcIsNotGlobal) {
141
+ inline_copy(group, dst, src, count);
142
+ return;
143
+ }
144
+
145
+ const unsigned int stride = group.size();
146
+ const unsigned int rank = group.thread_rank();
147
+ // Efficient copies require warps to operate on the same amount of work at each step.
148
+ // remainders are handled in a separate stage to prevent branching
149
+ const unsigned int subWarpMask = (stride - 1);
150
+ const unsigned int subwarpCopies = (subWarpMask & (unsigned int)count);
151
+ const unsigned int maxSubwarpRank = min(rank, subwarpCopies - 1);
152
+
153
+ const size_t warpCopies = (count & (~subWarpMask));
154
+
155
+ for (size_t idx = 0; idx < warpCopies; idx += stride) {
156
+ size_t _srcIdx = rank + idx;
157
+ size_t _dstIdx = rank + idx;
158
+ __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
159
+ }
160
+
161
+ if (subwarpCopies) {
162
+ size_t _srcIdx = warpCopies + maxSubwarpRank;
163
+ size_t _dstIdx = warpCopies + maxSubwarpRank;
164
+ __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
165
+ }
166
+ }
167
+
168
+ template <typename TyGroup, typename TyElem, disable_tile_optimization<TyGroup> = nullptr>
169
+ _CG_STATIC_QUALIFIER void accelerated_async_copy(TyGroup &group, TyElem *__restrict__ dst,
170
+ const TyElem *__restrict__ src, size_t count) {
171
+ static_assert(async_copy_group_supported<TyGroup>::value,
172
+ "Async copy is only supported for groups that represent private shared memory");
173
+
174
+ const bool dstIsNotShared = !__isShared(dst);
175
+ const bool srcIsNotGlobal = !__isGlobal(src);
176
+
177
+ if (dstIsNotShared || srcIsNotGlobal) {
178
+ inline_copy(group, dst, src, count);
179
+ return;
180
+ }
181
+
182
+ unsigned int stride = group.size();
183
+ unsigned int rank = group.thread_rank();
184
+
185
+ for (size_t idx = rank; idx < count; idx += stride) {
186
+ size_t _srcIdx = idx;
187
+ size_t _dstIdx = idx;
188
+ __pipeline_memcpy_async(dst + _dstIdx, src + _srcIdx, sizeof(TyElem));
189
+ }
190
+ }
191
+
192
+ // Determine best possible alignment given an input and initial conditions
193
+ // Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
194
+ template <unsigned int MinAlignment, unsigned int MaxAlignment>
195
+ _CG_STATIC_QUALIFIER uint32_t find_best_alignment(void *__restrict__ dst, const void *__restrict__ src) {
196
+ // Narrowing conversion intentional
197
+ uint32_t base1 = (uint32_t) reinterpret_cast<uintptr_t>(src);
198
+ uint32_t base2 = (uint32_t) reinterpret_cast<uintptr_t>(dst);
199
+
200
+ uint32_t diff = ((base1) ^ (base2)) & (MaxAlignment - 1);
201
+
202
+ // range [MaxAlignment, alignof(elem)], step: x >> 1
203
+ // over range of possible alignments, choose best available out of range
204
+ uint32_t out = MaxAlignment;
205
+ #pragma unroll
206
+ for (uint32_t alignment = (MaxAlignment >> 1); alignment >= MinAlignment; alignment >>= 1) {
207
+ if (alignment & diff)
208
+ out = alignment;
209
+ }
210
+
211
+ return out;
212
+ }
213
+
214
+ // Determine best possible alignment given an input and initial conditions
215
+ // Attempts to generate as little code as possible, most likely should only be used with 1 and 2 byte alignments
216
+ template <typename TyType, typename TyGroup>
217
+ _CG_STATIC_QUALIFIER void copy_like(const TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
218
+ size_t count) {
219
+ const char *src = reinterpret_cast<const char *>(_src);
220
+ char *dst = reinterpret_cast<char *>(_dst);
221
+
222
+ constexpr uint32_t targetAlignment = (uint32_t)alignof(TyType);
223
+
224
+ uint32_t base = (uint32_t) reinterpret_cast<uintptr_t>(src);
225
+ uint32_t alignOffset = ((~base) + 1) & (targetAlignment - 1);
226
+
227
+ inline_copy(group, dst, src, alignOffset);
228
+ count -= alignOffset;
229
+ src += alignOffset;
230
+ dst += alignOffset;
231
+
232
+ // Copy using the best available alignment, async_copy expects n-datums, not bytes
233
+ size_t asyncCount = count / sizeof(TyType);
234
+ accelerated_async_copy(group, reinterpret_cast<TyType *>(dst), reinterpret_cast<const TyType *>(src), asyncCount);
235
+ asyncCount *= sizeof(TyType);
236
+
237
+ count -= asyncCount;
238
+ src += asyncCount;
239
+ dst += asyncCount;
240
+ inline_copy(group, dst, src, count);
241
+ }
242
+
243
+ // We must determine alignment and manually align src/dst ourselves
244
+ template <size_t AlignHint>
245
+ struct _memcpy_async_align_dispatch {
246
+ template <typename TyGroup>
247
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ dst, const void *__restrict__ src, size_t count) {
248
+ uint32_t alignment = find_best_alignment<AlignHint, 16>(dst, src);
249
+
250
+ // Avoid copying the extra bytes if desired copy count is smaller
251
+ alignment = count < alignment ? AlignHint : alignment;
252
+
253
+ switch (alignment) {
254
+ default:
255
+ case 1:
256
+ inline_copy(group, reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src), count);
257
+ break;
258
+ case 2:
259
+ inline_copy(group, reinterpret_cast<short *>(dst), reinterpret_cast<const short *>(src), count >> 1);
260
+ break;
261
+ case 4:
262
+ copy_like<Segment<1>>(group, dst, src, count);
263
+ break;
264
+ case 8:
265
+ copy_like<Segment<2>>(group, dst, src, count);
266
+ break;
267
+ case 16:
268
+ copy_like<Segment<4>>(group, dst, src, count);
269
+ break;
270
+ }
271
+ }
272
+ };
273
+
274
+ // Specialization for 4 byte alignments
275
+ template <>
276
+ struct _memcpy_async_align_dispatch<4> {
277
+ template <typename TyGroup>
278
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
279
+ size_t count) {
280
+ const Segment<1> *src = reinterpret_cast<const Segment<1> *>(_src);
281
+ Segment<1> *dst = reinterpret_cast<Segment<1> *>(_dst);
282
+
283
+ // Dispatch straight to aligned LDGSTS calls
284
+ accelerated_async_copy(group, dst, src, count / sizeof(*dst));
285
+ }
286
+ };
287
+
288
+ // Specialization for 8 byte alignments
289
+ template <>
290
+ struct _memcpy_async_align_dispatch<8> {
291
+ template <typename TyGroup>
292
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
293
+ size_t count) {
294
+ const Segment<2> *src = reinterpret_cast<const Segment<2> *>(_src);
295
+ Segment<2> *dst = reinterpret_cast<Segment<2> *>(_dst);
296
+
297
+ // Dispatch straight to aligned LDGSTS calls
298
+ accelerated_async_copy(group, dst, src, count / sizeof(*dst));
299
+ }
300
+ };
301
+
302
+ // Alignments over 16 are truncated to 16 and bypass alignment
303
+ // This is the highest performing memcpy available
304
+ template <>
305
+ struct _memcpy_async_align_dispatch<16> {
306
+ template <typename TyGroup>
307
+ _CG_STATIC_QUALIFIER void copy(TyGroup &group, void *__restrict__ _dst, const void *__restrict__ _src,
308
+ size_t count) {
309
+ const Segment<4> *src = reinterpret_cast<const Segment<4> *>(_src);
310
+ Segment<4> *dst = reinterpret_cast<Segment<4> *>(_dst);
311
+
312
+ // Dispatch straight to aligned LDGSTS calls
313
+ accelerated_async_copy(group, dst, src, count / sizeof(*dst));
314
+ }
315
+ };
316
+
317
+ // byte-wide API
318
+ template <size_t Alignment, class TyGroup>
319
+ _CG_STATIC_QUALIFIER void _memcpy_async_dispatch_to_aligned_copy(const TyGroup &group, void *__restrict__ _dst,
320
+ const void *__restrict__ _src, size_t count) {
321
+ static_assert(!(Alignment & (Alignment - 1)), "Known static alignment dispatch must be a power of 2");
322
+ details::_memcpy_async_align_dispatch<Alignment>::copy(group, _dst, _src, count);
323
+ }
324
+
325
+ // Internal dispatch APIs
326
+ // These deduce the alignments and sizes necessary to invoke the underlying copy engine
327
+ template <typename Ty>
328
+ using is_void = _CG_STL_NAMESPACE::is_same<Ty, void>;
329
+
330
+ template <typename Ty>
331
+ using enable_if_not_void = typename _CG_STL_NAMESPACE::enable_if<!is_void<Ty>::value, void *>::type;
332
+
333
+ template <typename Ty>
334
+ using enable_if_void = typename _CG_STL_NAMESPACE::enable_if<is_void<Ty>::value, void *>::type;
335
+
336
+ template <typename Ty>
337
+ using enable_if_integral =
338
+ typename _CG_STL_NAMESPACE::enable_if<_CG_STL_NAMESPACE::is_integral<Ty>::value, void *>::type;
339
+
340
+ // byte-wide API using aligned_sized_t
341
+ template <class TyGroup, template <size_t> typename Alignment, size_t Hint>
342
+ _CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, void *__restrict__ _dst,
343
+ const void *__restrict__ _src, const Alignment<Hint> &count) {
344
+ constexpr size_t _align = (Hint > 16) ? 16 : Hint;
345
+
346
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, (size_t)count);
347
+ }
348
+
349
+ // byte-wide API using type for aligment
350
+ template <class TyGroup, typename TyElem, typename TySize, size_t Hint = alignof(TyElem),
351
+ enable_if_not_void<TyElem> = nullptr, enable_if_integral<TySize> = nullptr>
352
+ _CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
353
+ const TyElem *__restrict__ _src, const TySize& count) {
354
+ constexpr size_t _align = (Hint > 16) ? 16 : Hint;
355
+
356
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, _dst, _src, count);
357
+ }
358
+
359
+ // byte-wide API with full alignment deduction required
360
+ template <class TyGroup, typename TyElem, typename TySize, enable_if_void<TyElem> = nullptr,
361
+ enable_if_integral<TySize> = nullptr>
362
+ _CG_STATIC_QUALIFIER void _memcpy_async_bytes(const TyGroup &group, TyElem *__restrict__ _dst,
363
+ const TyElem *__restrict__ _src, const TySize& count) {
364
+ details::_memcpy_async_dispatch_to_aligned_copy<1>(group, _dst, _src, count);
365
+ }
366
+
367
+ // 1d-datum API
368
+ template <class TyGroup, typename TyElem, size_t Hint = alignof(TyElem)>
369
+ _CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const size_t dstCount,
370
+ const TyElem *__restrict__ src, const size_t srcCount) {
371
+ constexpr unsigned int _align = Hint;
372
+ const size_t totalCount = min(dstCount, srcCount) * sizeof(TyElem);
373
+
374
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
375
+ }
376
+
377
+ // 1d-datum API using aligned_size_t
378
+ template <class TyGroup, typename TyElem, template <size_t> typename Alignment, size_t Hint>
379
+ _CG_STATIC_QUALIFIER void _memcpy_async_datum(const TyGroup &group, TyElem *__restrict__ dst, const Alignment<Hint> &dstCount,
380
+ const TyElem *__restrict__ src, const Alignment<Hint> &srcCount) {
381
+ constexpr unsigned int _align = Hint;
382
+ const size_t totalCount = min((size_t)dstCount, (size_t)srcCount) * sizeof(TyElem);
383
+
384
+ details::_memcpy_async_dispatch_to_aligned_copy<_align>(group, dst, src, totalCount);
385
+ }
386
+
387
+ } // namespace details
388
+
389
+ /*
390
+ * Group submit batch of async-copy to cover contiguous 1D array
391
+ * and commit that batch to eventually wait for completion.
392
+ */
393
+ template <class TyGroup, typename TyElem, typename TySizeT>
394
+ _CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ _dst, const TyElem *__restrict__ _src,
395
+ const TySizeT &count) {
396
+ details::_memcpy_async_bytes(group, _dst, _src, count);
397
+ __pipeline_commit();
398
+ }
399
+
400
+ /*
401
+ * Group submit batch of async-copy to cover contiguous 1D array
402
+ * and commit that batch to eventually wait for completion.
403
+ * Object counts are in datum sized chunks, not bytes.
404
+ */
405
+ template <class TyGroup, class TyElem, typename DstLayout, typename SrcLayout>
406
+ _CG_STATIC_QUALIFIER void memcpy_async(const TyGroup &group, TyElem *__restrict__ dst, const DstLayout &dstLayout,
407
+ const TyElem *__restrict__ src, const SrcLayout &srcLayout) {
408
+ details::_memcpy_async_datum(group, dst, dstLayout, src, srcLayout);
409
+ __pipeline_commit();
410
+ }
411
+
412
+ /* Group wait for prior Nth stage of memcpy_async to complete. */
413
+ template <unsigned int Stage, class TyGroup>
414
+ _CG_STATIC_QUALIFIER void wait_prior(const TyGroup &group) {
415
+ __pipeline_wait_prior(Stage);
416
+ group.sync();
417
+ }
418
+
419
+ /* Group wait all previously submitted memcpy_async to complete. */
420
+ template <class TyGroup>
421
+ _CG_STATIC_QUALIFIER void wait(const TyGroup &group) {
422
+ __pipeline_wait_prior(0);
423
+ group.sync();
424
+ }
425
+
426
+ /***************** CG APIs including pipeline are deprecated *****************/
427
+
428
+ /* Group submit batch of async-copy to cover of contiguous 1D array
429
+ to a pipeline and commit the batch*/
430
+ template <class TyGroup, class TyElem>
431
+ _CG_DEPRECATED _CG_STATIC_QUALIFIER void memcpy_async(TyGroup &group, TyElem *dst, size_t dstCount, const TyElem *src, size_t srcCount,
432
+ nvcuda::experimental::pipeline &pipe) {
433
+ details::_memcpy_async_datum(group, dst, dstCount, src, srcCount);
434
+ pipe.commit();
435
+ }
436
+
437
+ /* Group wait for prior Nth stage of memcpy_async to complete. */
438
+ template <unsigned int Stage, class TyGroup>
439
+ _CG_DEPRECATED _CG_STATIC_QUALIFIER void wait_prior(TyGroup &group, nvcuda::experimental::pipeline &pipe) {
440
+ pipe.wait_prior<Stage>();
441
+ group.sync();
442
+ }
443
+
444
+ /* Group wait for stage-S of memcpy_async to complete. */
445
+ template <class TyGroup>
446
+ _CG_DEPRECATED _CG_STATIC_QUALIFIER void wait(TyGroup &group, nvcuda::experimental::pipeline &pipe, size_t stage) {
447
+ pipe.wait(stage);
448
+ group.sync();
449
+ }
450
+ _CG_END_NAMESPACE
451
+
452
+ #endif // _CG_ASYNC_H
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/coalesced_scan.h ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_COALESCED_SCAN_H_
50
+ #define _CG_COALESCED_SCAN_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "cooperative_groups.h"
55
+ #include "partitioning.h"
56
+ #include "functional.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <typename TyGroup, typename TyVal, typename TyOp>
63
+ _CG_QUALIFIER auto inclusive_scan_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
64
+ auto out = val;
65
+ for (int mask = 1; mask < group.size(); mask <<= 1) {
66
+ auto tmp = group.shfl_up(out, mask);
67
+ if (mask <= group.thread_rank()) {
68
+ out = op(out, tmp);
69
+ }
70
+ }
71
+
72
+ return out;
73
+ }
74
+
75
+ template <typename TyGroup, typename TyVal, typename TyOp>
76
+ _CG_QUALIFIER auto inclusive_scan_non_contiguous(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
77
+ const unsigned int groupSize = group.size();
78
+ auto out = val;
79
+
80
+ const unsigned int mask = details::_coalesced_group_data_access::get_mask(group);
81
+ unsigned int lanemask = details::lanemask32_lt() & mask;
82
+ unsigned int srcLane = details::laneid();
83
+
84
+ const unsigned int base = __ffs(mask)-1; /* lane with rank == 0 */
85
+ const unsigned int rank = __popc(lanemask);
86
+
87
+ for (unsigned int i = 1, j = 1; i < groupSize; i <<= 1) {
88
+ if (i <= rank) {
89
+ srcLane -= j;
90
+ j = i; /* maximum possible lane */
91
+
92
+ unsigned int begLane = base + rank - i; /* minimum possible lane */
93
+
94
+ /* Next source lane is in the range [ begLane .. srcLane ]
95
+ * If begLane < srcLane then do a binary search.
96
+ */
97
+ while (begLane < srcLane) {
98
+ const unsigned int halfLane = (begLane + srcLane) >> 1;
99
+ const unsigned int halfMask = lanemask >> halfLane;
100
+ const unsigned int d = __popc(halfMask);
101
+ if (d < i) {
102
+ srcLane = halfLane - 1; /* halfLane too large */
103
+ }
104
+ else if ((i < d) || !(halfMask & 0x01)) {
105
+ begLane = halfLane + 1; /* halfLane too small */
106
+ }
107
+ else {
108
+ begLane = srcLane = halfLane; /* happen to hit */
109
+ }
110
+ }
111
+ }
112
+
113
+ auto tmp = details::tile::shuffle_dispatch<TyVal>::shfl(out, mask, srcLane, 32);
114
+ if (i <= rank) {
115
+ out = op(out, tmp);
116
+ }
117
+ }
118
+ return out;
119
+ }
120
+
121
+ template <unsigned int TySize, typename ParentT, typename TyVal, typename TyOp>
122
+ _CG_QUALIFIER auto coalesced_inclusive_scan(const __single_warp_thread_block_tile<TySize, ParentT>& group,
123
+ TyVal&& val,
124
+ TyOp&& op) -> decltype(op(val, val)) {
125
+ return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
126
+ }
127
+
128
+ template <typename TyVal, typename TyOp>
129
+ _CG_QUALIFIER auto coalesced_inclusive_scan(const coalesced_group& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
130
+ if (group.size() == 32) {
131
+ return inclusive_scan_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
132
+ }
133
+ else {
134
+ return inclusive_scan_non_contiguous(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
135
+ }
136
+ }
137
+
138
+ template <bool IntegralOptimized>
139
+ struct scan_choose_convertion;
140
+
141
+ template<>
142
+ struct scan_choose_convertion<true> {
143
+ template <typename TyGroup, typename TyRes, typename TyVal>
144
+ _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
145
+ return result - val;
146
+ }
147
+ };
148
+
149
+ template<>
150
+ struct scan_choose_convertion<false> {
151
+ template <typename TyGroup, typename TyRes, typename TyVal>
152
+ _CG_STATIC_QUALIFIER details::remove_qual<TyVal> convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val) {
153
+ auto ret = group.shfl_up(result, 1);
154
+ if (group.thread_rank() == 0) {
155
+ return {};
156
+ }
157
+ else {
158
+ return ret;
159
+ }
160
+ }
161
+ };
162
+
163
+ template <typename TyGroup, typename TyRes, typename TyVal, typename TyFn>
164
+ _CG_QUALIFIER auto convert_inclusive_to_exclusive(const TyGroup& group, TyRes& result, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
165
+ using conversion = scan_choose_convertion<_CG_STL_NAMESPACE::is_same<remove_qual<TyFn>, cooperative_groups::plus<remove_qual<TyVal>>>::value
166
+ && _CG_STL_NAMESPACE::is_integral<remove_qual<TyVal>>::value>;
167
+ return conversion::convert_inclusive_to_exclusive(group, result, _CG_STL_NAMESPACE::forward<TyVal>(val));
168
+ }
169
+
170
+ } // details
171
+
172
+ _CG_END_NAMESPACE
173
+
174
+ #endif // _CG_COALESCED_SCAN_H_
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/driver_abi.h ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_DRIVER_API_H
50
+ #define _CG_DRIVER_API_H
51
+
52
+ #include "info.h"
53
+
54
+ _CG_BEGIN_NAMESPACE
55
+
56
+ namespace details {
57
+ template <unsigned int RegId>
58
+ _CG_QUALIFIER unsigned int load_env_reg() {
59
+ // Abort by default
60
+ _CG_ABORT();
61
+ return 0;
62
+ }
63
+
64
+ template <unsigned int HiReg, unsigned int LoReg>
65
+ _CG_QUALIFIER unsigned long long load_env_reg64() {
66
+ unsigned long long registerLo = load_env_reg<LoReg>();
67
+ unsigned long long registerHi = load_env_reg<HiReg>();
68
+
69
+ return (registerHi << 32) | registerLo;
70
+ }
71
+
72
+ // inline PTX for accessing registers requires an immediate for the special reg
73
+ # define LOAD_ENVREG(NUMBER) \
74
+ template <> _CG_QUALIFIER unsigned int load_env_reg<NUMBER>() { \
75
+ unsigned int r; \
76
+ asm ("mov.u32 %0, %%envreg" #NUMBER ";" : "=r"(r)); \
77
+ return r; \
78
+ }
79
+
80
+ // Instantiate loaders for registers used
81
+ LOAD_ENVREG(0);
82
+ LOAD_ENVREG(1);
83
+ LOAD_ENVREG(2);
84
+ # undef LOAD_ENVREG
85
+
86
+ struct grid_workspace {
87
+ unsigned int wsSize;
88
+ unsigned int barrier;
89
+ };
90
+
91
+ _CG_QUALIFIER grid_workspace* get_grid_workspace() {
92
+ unsigned long long gridWsAbiAddress = load_env_reg64<1, 2>();
93
+ // Interpret the address from envreg 1 and 2 as the driver's grid workspace
94
+ return (reinterpret_cast<grid_workspace*>(gridWsAbiAddress));
95
+ }
96
+ }
97
+ _CG_END_NAMESPACE
98
+
99
+ #endif // _CG_DRIVER_API_H
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/info.h ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2021 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+
50
+
51
+ #ifndef _CG_INFO_H_
52
+ #define _CG_INFO_H_
53
+ /*
54
+ ** Define: _CG_VERSION
55
+ */
56
+ #define _CG_VERSION 1000
57
+
58
+ /*
59
+ ** Define: _CG_ABI_VERSION
60
+ */
61
+ #ifndef _CG_ABI_VERSION
62
+ # define _CG_ABI_VERSION 1
63
+ #endif
64
+
65
+ /*
66
+ ** Define: _CG_ABI_EXPERIMENTAL
67
+ ** Desc: If enabled, sets all features enabled (ABI-breaking or experimental)
68
+ */
69
+ #if defined(_CG_ABI_EXPERIMENTAL)
70
+ #endif
71
+
72
+ #define _CG_CONCAT_INNER(x, y) x ## y
73
+ #define _CG_CONCAT_OUTER(x, y) _CG_CONCAT_INNER(x, y)
74
+ #define _CG_NAMESPACE _CG_CONCAT_OUTER(__v, _CG_ABI_VERSION)
75
+
76
+ #define _CG_BEGIN_NAMESPACE \
77
+ namespace cooperative_groups { namespace _CG_NAMESPACE {
78
+ #define _CG_END_NAMESPACE \
79
+ }; using namespace _CG_NAMESPACE; };
80
+
81
+ #if (defined(__cplusplus) && (__cplusplus >= 201103L)) || (defined(_MSC_VER) && (_MSC_VER >= 1900))
82
+ # define _CG_CPP11_FEATURES
83
+ #endif
84
+
85
+ #if !defined(_CG_QUALIFIER)
86
+ # define _CG_QUALIFIER __forceinline__ __device__
87
+ #endif
88
+ #if !defined(_CG_STATIC_QUALIFIER)
89
+ # define _CG_STATIC_QUALIFIER static __forceinline__ __device__
90
+ #endif
91
+ #if !defined(_CG_CONSTEXPR_QUALIFIER)
92
+ # if defined(_CG_CPP11_FEATURES)
93
+ # define _CG_CONSTEXPR_QUALIFIER constexpr __forceinline__ __device__
94
+ # else
95
+ # define _CG_CONSTEXPR_QUALIFIER _CG_QUALIFIER
96
+ # endif
97
+ #endif
98
+ #if !defined(_CG_STATIC_CONSTEXPR_QUALIFIER)
99
+ # if defined(_CG_CPP11_FEATURES)
100
+ # define _CG_STATIC_CONSTEXPR_QUALIFIER static constexpr __forceinline__ __device__
101
+ # else
102
+ # define _CG_STATIC_CONSTEXPR_QUALIFIER _CG_STATIC_QUALIFIER
103
+ # endif
104
+ #endif
105
+
106
+ #if defined(_MSC_VER)
107
+ # define _CG_DEPRECATED __declspec(deprecated)
108
+ #else
109
+ # define _CG_DEPRECATED __attribute__((deprecated))
110
+ #endif
111
+
112
+ #if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
113
+ # define _CG_HAS_GRID_GROUP
114
+ #endif
115
+ #if (__CUDA_ARCH__ >= 600) || !defined(__CUDA_ARCH__)
116
+ # define _CG_HAS_MULTI_GRID_GROUP
117
+ #endif
118
+ #if (__CUDA_ARCH__ >= 700) || !defined(__CUDA_ARCH__)
119
+ # define _CG_HAS_MATCH_COLLECTIVE
120
+ #endif
121
+
122
+ #if (__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__) && (defined(__NVCC__) || defined(__CUDACC_RTC__))
123
+ # define _CG_HAS_OP_REDUX
124
+ #endif
125
+
126
+ #if ((__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__)) && !defined(_CG_USER_PROVIDED_SHARED_MEMORY)
127
+ # define _CG_HAS_RESERVED_SHARED
128
+ #endif
129
+
130
+ #if ((__CUDA_ARCH__ >= 900) || !defined(__CUDA_ARCH__)) && \
131
+ (defined(__NVCC__) || defined(__CUDACC_RTC__) || defined(_CG_CLUSTER_INTRINSICS_AVAILABLE)) && \
132
+ defined(_CG_CPP11_FEATURES)
133
+ # define _CG_HAS_CLUSTER_GROUP
134
+ #endif
135
+
136
+ #if (__CUDA_ARCH__ >= 900) || !defined(__CUDA_ARCH__)
137
+ # define _CG_HAS_INSTR_ELECT
138
+ #endif
139
+
140
+ // Has __half and __half2
141
+ // Only usable if you include the cuda_fp16.h extension, and
142
+ // _before_ including cooperative_groups.h
143
+ #ifdef __CUDA_FP16_TYPES_EXIST__
144
+ # define _CG_HAS_FP16_COLLECTIVE
145
+ #endif
146
+
147
+ // Include libcu++ where supported.
148
+ #if defined(_CG_CPP11_FEATURES) && !defined(__QNX__) && !defined(__ibmxl__) && \
149
+ (defined(__NVCC__) || defined(__CUDACC_RTC__)) && \
150
+ (defined(__x86_64__) || defined(__aarch64__) || defined(__ppc64__)|| defined(_M_X64) || defined(_M_ARM64)) && \
151
+ (defined(_MSC_VER) || defined(__GNUC__) || defined(__clang__))
152
+ # define _CG_USE_CUDA_STL
153
+ #else
154
+ # define _CG_USE_OWN_TRAITS
155
+ #endif
156
+
157
+ #if defined(_CG_USE_CUDA_STL) && (!defined(__CUDA_ARCH__) || \
158
+ ((!defined(_MSC_VER) && __CUDA_ARCH__ >= 600) || (defined(_MSC_VER) && __CUDA_ARCH__ >= 700)))
159
+ # define _CG_HAS_STL_ATOMICS
160
+ #endif
161
+
162
+ #ifdef _CG_CPP11_FEATURES
163
+ // Use cuda::std:: for type_traits
164
+ # if defined(_CG_USE_CUDA_STL)
165
+ # define _CG_STL_NAMESPACE cuda::std
166
+ # include <cuda/std/type_traits>
167
+ // Use CG's implementation of type traits
168
+ # else
169
+ # define _CG_STL_NAMESPACE cooperative_groups::details::templates
170
+ # endif
171
+ #endif
172
+
173
+ #ifdef _CG_CPP11_FEATURES
174
+ # define _CG_STATIC_CONST_DECL static constexpr
175
+ # define _CG_CONST_DECL constexpr
176
+ #else
177
+ # define _CG_STATIC_CONST_DECL static const
178
+ # define _CG_CONST_DECL const
179
+ #endif
180
+
181
+ #if (defined(_MSC_VER) && !defined(_WIN64)) || defined(__arm__)
182
+ # define _CG_ASM_PTR_CONSTRAINT "r"
183
+ #else
184
+ # define _CG_ASM_PTR_CONSTRAINT "l"
185
+ #endif
186
+
187
+ /*
188
+ ** Define: CG_DEBUG
189
+ ** What: Enables various runtime safety checks
190
+ */
191
+ #if defined(__CUDACC_DEBUG__) && defined(CG_DEBUG) && !defined(NDEBUG)
192
+ # define _CG_DEBUG
193
+ #endif
194
+
195
+ #if defined(_CG_DEBUG)
196
+ # include <assert.h>
197
+ # define _CG_ASSERT(x) assert((x));
198
+ # define _CG_ABORT() assert(0);
199
+ #else
200
+ # define _CG_ASSERT(x)
201
+ # define _CG_ABORT() __trap();
202
+ #endif
203
+
204
+ _CG_BEGIN_NAMESPACE
205
+
206
+ namespace details {
207
+ _CG_STATIC_CONST_DECL unsigned int default_max_block_size = 1024;
208
+
209
+ #if defined(_CG_CPP11_FEATURES) && !defined(_CG_USE_CUDA_STL)
210
+ namespace templates {
211
+
212
+ /**
213
+ * Integral constants
214
+ **/
215
+ template <typename Ty, Ty Val>
216
+ struct integral_constant {
217
+ static constexpr Ty value = Val;
218
+ typedef Ty type;
219
+
220
+ _CG_QUALIFIER constexpr operator type() const noexcept { return value; }
221
+ _CG_QUALIFIER constexpr type operator()() const noexcept { return value; }
222
+ };
223
+
224
+ typedef integral_constant<bool, true> true_type;
225
+ typedef integral_constant<bool, false> false_type;
226
+
227
+ /**
228
+ * CV Qualifiers
229
+ **/
230
+ template <class Ty> struct is_lvalue_reference : public details::templates::false_type {};
231
+ template <class Ty> struct is_lvalue_reference<Ty&> : public details::templates::true_type {};
232
+
233
+ template <class Ty> struct remove_reference {typedef Ty type;};
234
+ template <class Ty> struct remove_reference<Ty&> {typedef Ty type;};
235
+ template <class Ty> struct remove_reference<Ty&&> {typedef Ty type;};
236
+
237
+ template <class Ty>
238
+ using remove_reference_t = typename details::templates::remove_reference<Ty>::type;
239
+
240
+ template <class Ty> struct remove_const {typedef Ty type;};
241
+ template <class Ty> struct remove_const<const Ty> {typedef Ty type;};
242
+
243
+ template <class Ty> struct remove_volatile {typedef Ty type;};
244
+ template <class Ty> struct remove_volatile<volatile Ty> {typedef Ty type;};
245
+
246
+ template <class Ty> struct remove_cv {typedef typename details::templates::remove_volatile<typename details::templates::remove_const<Ty>::type>::type type;};
247
+
248
+ template <class Ty>
249
+ using remove_cv_t = typename details::templates::remove_cv<Ty>::type;
250
+
251
+ template <class Ty>
252
+ _CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &t) noexcept {
253
+ return static_cast<Ty&&>(t);
254
+ }
255
+
256
+ template <class Ty>
257
+ _CG_QUALIFIER Ty&& forward(remove_reference_t<Ty> &&t) noexcept {
258
+ static_assert(!details::templates::is_lvalue_reference<Ty>::value, "Forwarding an rvalue as an lvalue is not allowed.");
259
+ return static_cast<Ty&&>(t);
260
+ }
261
+
262
+ /**
263
+ * is_integral
264
+ **/
265
+ template <class Ty> struct _is_integral : public details::templates::false_type {};
266
+ template <> struct _is_integral<bool> : public details::templates::true_type {};
267
+ template <> struct _is_integral<char> : public details::templates::true_type {};
268
+ template <> struct _is_integral<unsigned char> : public details::templates::true_type {};
269
+ template <> struct _is_integral<short> : public details::templates::true_type {};
270
+ template <> struct _is_integral<unsigned short> : public details::templates::true_type {};
271
+ template <> struct _is_integral<int> : public details::templates::true_type {};
272
+ template <> struct _is_integral<unsigned int> : public details::templates::true_type {};
273
+ template <> struct _is_integral<long> : public details::templates::true_type {};
274
+ template <> struct _is_integral<long long> : public details::templates::true_type {};
275
+ template <> struct _is_integral<unsigned long> : public details::templates::true_type {};
276
+ template <> struct _is_integral<unsigned long long> : public details::templates::true_type {};
277
+ //Vector type support?
278
+
279
+ template <typename Ty>
280
+ struct is_integral : public details::templates::_is_integral<typename details::templates::remove_cv<Ty>::type> {};
281
+
282
+ /**
283
+ * is_floating_point
284
+ **/
285
+ template <class Ty> struct _is_floating_point : public details::templates::false_type {};
286
+ template <> struct _is_floating_point<float> : public details::templates::true_type {};
287
+ template <> struct _is_floating_point<double> : public details::templates::true_type {};
288
+ template <> struct _is_floating_point<long double> : public details::templates::true_type {};
289
+ # ifdef __CUDA_FP16_TYPES_EXIST__
290
+ template <> struct _is_floating_point<__half> : public details::templates::true_type {};
291
+ template <> struct _is_floating_point<__half2> : public details::templates::true_type {};
292
+ # endif
293
+ //Vector type support?
294
+
295
+ template <typename Ty>
296
+ struct is_floating_point : public details::templates::_is_floating_point<typename details::templates::remove_cv<Ty>::type> {};
297
+
298
+ template <class T>
299
+ struct is_arithmetic : details::templates::integral_constant<
300
+ bool,
301
+ details::templates::is_integral<T>::value ||
302
+ details::templates::is_floating_point<T>::value> {};
303
+
304
+ template <typename Ty, bool = details::templates::is_arithmetic<Ty>::value>
305
+ struct _is_unsigned : details::templates::integral_constant<bool, Ty(0) < Ty(-1)> {};
306
+
307
+ template <typename Ty>
308
+ struct _is_unsigned<Ty,false> : details::templates::false_type {};
309
+
310
+ template <typename Ty>
311
+ struct is_unsigned : _is_unsigned<typename details::templates::remove_cv<Ty>::type> {};
312
+
313
+ template <typename Ty> struct _is_pointer : public details::templates::false_type {};
314
+ template <typename Ty> struct _is_pointer<Ty*> : public details::templates::true_type {};
315
+
316
+ template <typename Ty>
317
+ struct is_pointer : _is_pointer<typename details::templates::remove_cv<Ty>::type> {};
318
+
319
+ /**
320
+ * programmatic type traits
321
+ **/
322
+ template<bool B, class Ty = void>
323
+ struct enable_if {};
324
+
325
+ template<class Ty>
326
+ struct enable_if<true, Ty> { typedef Ty type; };
327
+
328
+ template<bool Cond, typename Ty = void>
329
+ using enable_if_t = typename details::templates::enable_if<Cond, Ty>::type;
330
+
331
+ template<class Ty1, class Ty2>
332
+ struct is_same : details::templates::false_type {};
333
+
334
+ template<class Ty>
335
+ struct is_same<Ty, Ty> : details::templates::true_type {};
336
+
337
+ } // templates
338
+ #endif // _CG_CPP11_FEATURES
339
+
340
+ } // details
341
+ _CG_END_NAMESPACE
342
+
343
+
344
+ #endif // _CG_INFO_H_
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/invoke.h ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CG_INVOKE_H
51
+ #define _CG_INVOKE_H
52
+
53
+ #include "info.h"
54
+ #include "helpers.h"
55
+
56
+ #if defined(_CG_CPP11_FEATURES)
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <typename Group>
63
+ struct _elect_group_supported : _CG_STL_NAMESPACE::false_type {};
64
+ #ifdef _CG_HAS_INSTR_ELECT
65
+ template<>
66
+ struct _elect_group_supported<coalesced_group> : _CG_STL_NAMESPACE::true_type {};
67
+ template<unsigned int Size, typename Parent>
68
+ struct _elect_group_supported<thread_block_tile<Size, Parent>> :
69
+ _CG_STL_NAMESPACE::integral_constant<bool, (Size <= 32)> {};
70
+ #endif
71
+
72
+ template <typename Group>
73
+ struct elect_group_supported : public _elect_group_supported<details::remove_qual<Group>> {};
74
+
75
+ template<typename Group>
76
+ _CG_STATIC_QUALIFIER bool elect_one(const Group& group, unsigned int mask, unsigned int& leader_lane) {
77
+ int is_leader = 0;
78
+ #ifdef _CG_HAS_INSTR_ELECT
79
+ asm("{\n\t"
80
+ " .reg .pred p;\n\t"
81
+ " elect.sync %0|p, %2;\n\t"
82
+ " @p mov.s32 %1, 1;\n\t"
83
+ "}"
84
+ : "+r"(leader_lane), "+r"(is_leader) : "r" (mask));
85
+ #endif
86
+ return is_leader;
87
+ }
88
+
89
+ template<bool UseElect>
90
+ struct invoke_one_impl {};
91
+
92
+ template<>
93
+ struct invoke_one_impl<true> {
94
+ template<typename Group, typename Fn, typename... Args>
95
+ _CG_STATIC_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
96
+ auto mask = details::_coalesced_group_data_access::get_mask(group);
97
+ unsigned int leader_lane = 0;
98
+
99
+ if (elect_one(group, mask, leader_lane)) {
100
+ _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
101
+ }
102
+ }
103
+
104
+ template<typename Group, typename Fn, typename... Args>
105
+ _CG_STATIC_QUALIFIER auto invoke_one_broadcast(const Group& group, Fn&& fn, Args&&... args)
106
+ -> typename _CG_STL_NAMESPACE::remove_reference<
107
+ decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
108
+
109
+ using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
110
+ details::remove_qual<ResultType> result;
111
+ auto mask = details::_coalesced_group_data_access::get_mask(group);
112
+ unsigned int leader_lane = 0;
113
+
114
+ if (elect_one(group, mask, leader_lane)) {
115
+ result = _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
116
+ }
117
+
118
+ // Need to use low level api instead of group.shfl, because elect_one returns lane id, not group rank.
119
+ return tile::shuffle_dispatch<ResultType>::shfl(result, mask, leader_lane, 32);
120
+ }
121
+ };
122
+
123
+ template<>
124
+ struct invoke_one_impl<false> {
125
+ template<typename Group, typename Fn, typename... Args>
126
+ _CG_STATIC_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
127
+ if (group.thread_rank() == 0) {
128
+ _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
129
+ }
130
+ }
131
+
132
+ template<typename Group, typename Fn, typename... Args>
133
+ _CG_STATIC_QUALIFIER auto invoke_one_broadcast(const Group& group, Fn&& fn, Args&&... args)
134
+ -> typename _CG_STL_NAMESPACE::remove_reference<
135
+ decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
136
+
137
+ using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
138
+ details::remove_qual<ResultType> result;
139
+
140
+ if (group.thread_rank() == 0) {
141
+ result = _CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...);
142
+ }
143
+
144
+ return group.shfl(result, 0);
145
+ }
146
+ };
147
+
148
+
149
+ }; // namespace details
150
+
151
+ template<typename Group, typename Fn, typename... Args>
152
+ _CG_QUALIFIER void invoke_one(const Group& group, Fn&& fn, Args&&... args) {
153
+ using impl = details::invoke_one_impl<details::elect_group_supported<Group>::value>;
154
+ impl::invoke_one(group, _CG_STL_NAMESPACE::forward<Fn>(fn), _CG_STL_NAMESPACE::forward<Args>(args)...);
155
+ }
156
+
157
+ template<typename Fn, typename... Args>
158
+ _CG_QUALIFIER auto invoke_one_broadcast(const coalesced_group& group, Fn&& fn, Args&&... args)
159
+ -> typename _CG_STL_NAMESPACE::remove_reference<
160
+ decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
161
+
162
+ using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
163
+ static_assert(!_CG_STL_NAMESPACE::is_same<ResultType, void>::value,
164
+ "For invocables returning void invoke_one should be used instead");
165
+ using impl = details::invoke_one_impl<details::elect_group_supported<coalesced_group>::value>;
166
+ return impl::invoke_one_broadcast(group,
167
+ _CG_STL_NAMESPACE::forward<Fn>(fn),
168
+ _CG_STL_NAMESPACE::forward<Args>(args)...);
169
+ }
170
+
171
+ template<unsigned int Size, typename Parent, typename Fn, typename... Args>
172
+ _CG_QUALIFIER auto invoke_one_broadcast(const thread_block_tile<Size, Parent>& group, Fn&& fn, Args&&... args)
173
+ -> typename _CG_STL_NAMESPACE::remove_reference<
174
+ decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...))>::type {
175
+
176
+ using ResultType = decltype(_CG_STL_NAMESPACE::forward<Fn>(fn)(_CG_STL_NAMESPACE::forward<Args>(args)...));
177
+ static_assert(!_CG_STL_NAMESPACE::is_same<ResultType, void>::value,
178
+ "For invocables returning void invoke_one should be used instead");
179
+ using impl = details::invoke_one_impl<details::elect_group_supported<thread_block_tile<Size, Parent>>::value>;
180
+ return impl::invoke_one_broadcast(group,
181
+ _CG_STL_NAMESPACE::forward<Fn>(fn),
182
+ _CG_STL_NAMESPACE::forward<Args>(args)...);
183
+ }
184
+
185
+ _CG_END_NAMESPACE
186
+
187
+ #endif //_CG_CPP11_FEATURES
188
+
189
+ #endif // _CG_INVOKE_H
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/memory.h ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2022 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _COOPERATIVE_GROUPS_MEMORY_H_
50
+ # define _COOPERATIVE_GROUPS_MEMORY_H_
51
+
52
+ #include "info.h"
53
+
54
+ _CG_BEGIN_NAMESPACE
55
+
56
+ #if defined(_CG_CPP11_FEATURES)
57
+ namespace details {
58
+ _CG_STATIC_CONST_DECL int scratch_num_reserved_bytes = 12;
59
+
60
+ #if defined(_CG_HAS_RESERVED_SHARED)
61
+ _CG_STATIC_QUALIFIER void* reserved_shared_ptr()
62
+ {
63
+ void *ptr;
64
+ asm ("{\n\t"
65
+ " .reg .u32 start;\n\t"
66
+ " .reg .u64 extended;\n\t"
67
+ " mov.u32 start, %%reserved_smem_offset_1;\n\t"
68
+ " cvt.u64.u32 extended, start;\n\t"
69
+ " cvta.shared.u64 %0, extended;\n\t"
70
+ "}"
71
+ : "=" _CG_ASM_PTR_CONSTRAINT(ptr));
72
+ return ptr;
73
+ }
74
+ #endif
75
+
76
+ struct multi_warp_scratch {
77
+ // One barrier per possible size of the group.
78
+ _CG_STATIC_CONST_DECL unsigned int memory_barriers_count = 5;
79
+ _CG_STATIC_CONST_DECL size_t sync_memory_size = memory_barriers_count * sizeof(barrier_t);
80
+
81
+ using communication_type = unsigned long long;
82
+ _CG_STATIC_CONST_DECL size_t communication_size = sizeof(communication_type);
83
+
84
+ // Layout of the scratch space:
85
+ barrier_t barriers[memory_barriers_count];
86
+ char reserved[scratch_num_reserved_bytes]; // Reserve 12 bytes for future use
87
+ communication_type communication_memory[default_max_block_size / 32];
88
+
89
+ _CG_STATIC_CONSTEXPR_QUALIFIER unsigned int scratch_size_needed(unsigned int max_block_size) {
90
+ // One slot of collectives memory per warp.
91
+ return scratch_num_reserved_bytes + sync_memory_size + max_block_size / 32 * communication_size;
92
+ }
93
+
94
+ _CG_QUALIFIER void init_barriers(unsigned int thread_rank) {
95
+ if (thread_rank < memory_barriers_count) {
96
+ barriers[thread_rank] = 0;
97
+ }
98
+ }
99
+ };
100
+
101
+ #if defined(_CG_HAS_RESERVED_SHARED)
102
+ // CG can expect at least 288 bytes available in reserved shared
103
+ static_assert(sizeof(multi_warp_scratch) <= 288, "multi-warp scratch size is too large");
104
+ #endif
105
+
106
+ // Make sure the structure can fit into the user provided memory
107
+ static_assert(sizeof(multi_warp_scratch) <= multi_warp_scratch::scratch_size_needed(default_max_block_size),
108
+ "multi-warp scratch size is too large");
109
+
110
+
111
+ _CG_QUALIFIER multi_warp_scratch* get_scratch_ptr(void* user_scratch) {
112
+ void *ptr;
113
+ #if defined(_CG_HAS_RESERVED_SHARED)
114
+ ptr = reserved_shared_ptr();
115
+ #else
116
+ ptr = user_scratch;
117
+ #endif
118
+ return static_cast<multi_warp_scratch*>(ptr);
119
+
120
+ }
121
+
122
+ }
123
+
124
+ template <unsigned int MaxBlockSize = details::default_max_block_size>
125
+ struct __align__(details::multi_warp_scratch::communication_size) block_tile_memory {
126
+ private:
127
+ #if !defined(_CG_HAS_RESERVED_SHARED)
128
+ char scratch[details::multi_warp_scratch::scratch_size_needed(MaxBlockSize)];
129
+ #endif
130
+ };
131
+ #endif
132
+
133
+ _CG_END_NAMESPACE
134
+
135
+ #endif /* !_COOPERATIVE_GROUPS_MEMORY_H_ */
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/partitioning.h ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CG_PARTITIONING_H
51
+ #define _CG_PARTITIONING_H
52
+
53
+ #include "info.h"
54
+ #include "helpers.h"
55
+
56
+ _CG_BEGIN_NAMESPACE
57
+
58
+ namespace details {
59
+
60
+ template <typename TyGroup>
61
+ _CG_STATIC_QUALIFIER coalesced_group _binary_partition(const TyGroup &tile, bool pred) {
62
+ const unsigned int fullMask = ~0u;
63
+
64
+ unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
65
+ unsigned int predMask = pred ? 0 : fullMask;
66
+ unsigned int setMask = __ballot_sync(thisMask, pred);
67
+
68
+ if (setMask == thisMask || setMask == 0) {
69
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(thisMask);
70
+ _coalesced_group_data_access::modify_meta_group(subTile, 0, 1);
71
+ return subTile;
72
+ }
73
+ else {
74
+ unsigned int subMask = thisMask & (setMask ^ predMask);
75
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
76
+ _coalesced_group_data_access::modify_meta_group(subTile, pred, 2);
77
+ return subTile;
78
+ }
79
+ }
80
+
81
+ #if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
82
+ template <typename TyPredicate>
83
+ struct _labeled_partition_dispatch {
84
+ template <typename TyGroup>
85
+ _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, TyPredicate pred) {
86
+ unsigned int thisMask = _coalesced_group_data_access::get_mask(tile);
87
+ unsigned int thisBias = __ffs(thisMask) - 1; // Subtract 1 to index properly from [1-32]
88
+ unsigned int subMask = __match_any_sync(thisMask, pred);
89
+
90
+ coalesced_group subTile = _coalesced_group_data_access::construct_from_mask<coalesced_group>(subMask);
91
+
92
+ int leaderLaneId = subTile.shfl(details::laneid(), 0);
93
+
94
+ bool isLeader = !subTile.thread_rank();
95
+ unsigned int leaderMask = __ballot_sync(thisMask, isLeader);
96
+ unsigned int tileRank = __fns(leaderMask, leaderLaneId, 0) - thisBias;
97
+
98
+ _coalesced_group_data_access::modify_meta_group(subTile, tileRank, __popc(leaderMask));
99
+
100
+ return subTile;
101
+ }
102
+ };
103
+
104
+ template <>
105
+ struct _labeled_partition_dispatch<bool> {
106
+ template <typename TyGroup>
107
+ _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, bool pred) {
108
+ return _binary_partition(tile, pred);
109
+ }
110
+ };
111
+
112
+ template <typename TyPredicate>
113
+ struct _labeled_partition_dispatch<TyPredicate*> {
114
+ template <typename TyGroup>
115
+ _CG_QUALIFIER coalesced_group operator()(const TyGroup &tile, TyPredicate* pred) {
116
+ auto impl = _labeled_partition_dispatch<unsigned long long>();
117
+ return impl(tile, reinterpret_cast<unsigned long long>(pred));
118
+ }
119
+ };
120
+ #endif
121
+ }; // namespace details
122
+
123
+ _CG_STATIC_QUALIFIER coalesced_group binary_partition(const coalesced_group &tile, bool pred) {
124
+ return details::_binary_partition(tile, pred);
125
+ }
126
+
127
+ template <unsigned int Size, typename ParentT>
128
+ _CG_STATIC_QUALIFIER coalesced_group binary_partition(const thread_block_tile<Size, ParentT> &tile, bool pred) {
129
+ #ifdef _CG_CPP11_FEATURES
130
+ static_assert(Size <= 32, "Binary partition is available only for tiles of size smaller or equal to 32");
131
+ #endif
132
+ return details::_binary_partition(tile, pred);
133
+ }
134
+
135
+
136
+ #if defined(_CG_HAS_MATCH_COLLECTIVE) && defined(_CG_CPP11_FEATURES)
137
+ template <typename TyPredicate>
138
+ _CG_STATIC_QUALIFIER coalesced_group labeled_partition(const coalesced_group &tile, TyPredicate pred) {
139
+ static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value ||
140
+ _CG_STL_NAMESPACE::is_pointer<TyPredicate>::value,
141
+ "labeled_partition predicate must be an integral or pointer type");
142
+ auto dispatch = details::_labeled_partition_dispatch<details::remove_qual<TyPredicate>>();
143
+ return dispatch(tile, pred);
144
+ }
145
+
146
+ template <typename TyPredicate, unsigned int Size, typename ParentT>
147
+ _CG_STATIC_QUALIFIER coalesced_group labeled_partition(const thread_block_tile<Size, ParentT> &tile, TyPredicate pred) {
148
+ static_assert(_CG_STL_NAMESPACE::is_integral<TyPredicate>::value ||
149
+ _CG_STL_NAMESPACE::is_pointer<TyPredicate>::value,
150
+ "labeled_partition predicate must be an integral or pointer type");
151
+ static_assert(Size <= 32, "Labeled partition is available only for tiles of size smaller or equal to 32");
152
+ auto dispatch = details::_labeled_partition_dispatch<details::remove_qual<TyPredicate>>();
153
+ return dispatch(tile, pred);
154
+ }
155
+ #endif
156
+
157
+ _CG_END_NAMESPACE
158
+
159
+ #endif // _CG_PARTITIONING_H
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/reduce.h ADDED
@@ -0,0 +1,419 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_REDUCE_H_
50
+ #define _CG_REDUCE_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "coalesced_reduce.h"
55
+ #include "functional.h"
56
+ #include "cooperative_groups.h"
57
+
58
+ _CG_BEGIN_NAMESPACE
59
+
60
+ namespace details {
61
+
62
+ template <class Ty>
63
+ using _redux_is_add_supported = _CG_STL_NAMESPACE::integral_constant<
64
+ bool,
65
+ _CG_STL_NAMESPACE::is_integral<Ty>::value && (sizeof(Ty) <= 4)>;
66
+
67
+ template <class Ty>
68
+ using redux_is_add_supported = _redux_is_add_supported<Ty>;
69
+
70
+ // A specialization for 64 bit logical operations is possible
71
+ // but for now only accelerate 32 bit bitwise ops
72
+ template <class Ty>
73
+ using redux_is_logical_supported = redux_is_add_supported<Ty>;
74
+
75
+ // Base operator support case
76
+ template <class TyOp, class Ty> struct _redux_op_supported : public _CG_STL_NAMESPACE::false_type {};
77
+ #ifdef _CG_HAS_OP_REDUX
78
+ template <class Ty> struct _redux_op_supported<cooperative_groups::plus<Ty>, Ty> : public redux_is_add_supported<Ty> {};
79
+ template <class Ty> struct _redux_op_supported<cooperative_groups::less<Ty>, Ty> : public redux_is_add_supported<Ty> {};
80
+ template <class Ty> struct _redux_op_supported<cooperative_groups::greater<Ty>, Ty> : public redux_is_add_supported<Ty> {};
81
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_and<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
82
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_or<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
83
+ template <class Ty> struct _redux_op_supported<cooperative_groups::bit_xor<Ty>, Ty> : public redux_is_logical_supported<Ty> {};
84
+ #endif
85
+
86
+ template <class Ty, template <class> class TyOp>
87
+ using redux_op_supported = _redux_op_supported<
88
+ typename details::remove_qual<TyOp<Ty>>,
89
+ Ty>;
90
+
91
+ // Groups smaller than 16 actually have worse performance characteristics when used with redux
92
+ // tiles of size 16 and 32 perform the same or better and have better code generation profiles
93
+ template <class TyGroup> struct _redux_group_optimized : public _CG_STL_NAMESPACE::false_type {};
94
+
95
+ template <unsigned int Sz, typename TyPar>
96
+ struct _redux_group_optimized<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
97
+ bool,
98
+ (Sz >= 16)> {};
99
+ template <unsigned int Sz, typename TyPar>
100
+ struct _redux_group_optimized<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::integral_constant<
101
+ bool,
102
+ (Sz >= 16)> {};
103
+ template <>
104
+ struct _redux_group_optimized<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
105
+
106
+ template <typename TyGroup>
107
+ using redux_group_optimized = _redux_group_optimized<details::remove_qual<TyGroup>>;
108
+
109
+ template <template <class> class TyOp>
110
+ _CG_STATIC_QUALIFIER int pick_redux(int mask, int val);
111
+ template <template <class> class TyOp>
112
+ _CG_STATIC_QUALIFIER unsigned int pick_redux(int mask, unsigned int val);
113
+
114
+ #ifdef _CG_HAS_OP_REDUX
115
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::plus>(int mask, int val) {
116
+ return __reduce_add_sync(mask, val);
117
+ }
118
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::less>(int mask, int val) {
119
+ return __reduce_min_sync(mask, val);
120
+ }
121
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::greater>(int mask, int val) {
122
+ return __reduce_max_sync(mask, val);
123
+ }
124
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_and>(int mask, int val) {
125
+ return __reduce_and_sync(mask, val);
126
+ }
127
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_xor>(int mask, int val) {
128
+ return __reduce_xor_sync(mask, val);
129
+ }
130
+ template <> _CG_QUALIFIER int pick_redux<cooperative_groups::bit_or>(int mask, int val) {
131
+ return __reduce_or_sync(mask, val);
132
+ }
133
+
134
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::plus>(int mask, unsigned int val) {
135
+ return __reduce_add_sync(mask, val);
136
+ }
137
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::less>(int mask, unsigned int val) {
138
+ return __reduce_min_sync(mask, val);
139
+ }
140
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::greater>(int mask, unsigned int val) {
141
+ return __reduce_max_sync(mask, val);
142
+ }
143
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_and>(int mask, unsigned int val) {
144
+ return __reduce_and_sync(mask, val);
145
+ }
146
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_xor>(int mask, unsigned int val) {
147
+ return __reduce_xor_sync(mask, val);
148
+ }
149
+ template <> _CG_QUALIFIER unsigned int pick_redux<cooperative_groups::bit_or>(int mask, unsigned int val) {
150
+ return __reduce_or_sync(mask, val);
151
+ }
152
+ #endif
153
+
154
+
155
+ template <typename TyVal, bool = _CG_STL_NAMESPACE::is_unsigned<TyVal>::value>
156
+ struct _accelerated_op;
157
+
158
+ // Signed type redux intrinsic dispatch
159
+ template <typename TyVal>
160
+ struct _accelerated_op<TyVal, false> {
161
+ template <template <class> class TyOp>
162
+ _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
163
+ return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<int>(val)));
164
+ }
165
+ };
166
+
167
+ // Unsigned type redux intrinsic dispatch
168
+ template <typename TyVal>
169
+ struct _accelerated_op<TyVal, true> {
170
+ template <template <class> class TyOp>
171
+ _CG_STATIC_QUALIFIER TyVal redux(int mask, TyVal val) {
172
+ return static_cast<TyVal>(pick_redux<TyOp>(mask, static_cast<unsigned int>(val)));
173
+ }
174
+ };
175
+
176
+ template <typename TyVal>
177
+ using accelerated_op = _accelerated_op<TyVal>;
178
+
179
+
180
+ template <typename TyVal, typename TyFnInput, typename TyGroup>
181
+ class _redux_dispatch {
182
+ template <class Ty, template <class> class TyOp>
183
+ using _redux_is_usable = _CG_STL_NAMESPACE::integral_constant<bool,
184
+ redux_op_supported<Ty, TyOp>::value &&
185
+ redux_group_optimized<TyGroup>::value>;
186
+
187
+ template <class Ty, template <class> class TyOp>
188
+ using redux_is_usable = typename _CG_STL_NAMESPACE::enable_if<_redux_is_usable<Ty, TyOp>::value, void>::type*;
189
+
190
+ template <class Ty, template <class> class TyOp>
191
+ using redux_is_not_usable = typename _CG_STL_NAMESPACE::enable_if<!_redux_is_usable<Ty, TyOp>::value, void>::type*;
192
+
193
+ public:
194
+ // Dispatch to redux if the combination of op and args are supported
195
+ template<
196
+ template <class> class TyOp,
197
+ redux_is_usable<TyFnInput, TyOp> = nullptr>
198
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
199
+ // Retrieve the mask for the group and dispatch to redux
200
+ return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
201
+ }
202
+
203
+ template<
204
+ template <class> class TyOp,
205
+ redux_is_usable<TyFnInput, TyOp> = nullptr>
206
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
207
+ // Retrieve the mask for the group and dispatch to redux
208
+ return accelerated_op<TyFnInput>::template redux<TyOp>(_coalesced_group_data_access::get_mask(group), _CG_STL_NAMESPACE::forward<TyVal>(val));
209
+ }
210
+
211
+ // Fallback shuffle sync reduction
212
+ template <
213
+ template <class> class TyOp,
214
+ redux_is_not_usable<TyFnInput, TyOp> = nullptr>
215
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
216
+ //Dispatch to fallback shuffle sync accelerated reduction
217
+ return coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
218
+ }
219
+
220
+ };
221
+
222
+ // Group support for reduce.
223
+ template <class TyGroup> struct _reduce_group_supported : public _CG_STL_NAMESPACE::false_type {};
224
+
225
+ template <unsigned int Sz, typename TyPar>
226
+ struct _reduce_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
227
+ template <unsigned int Sz, typename TyPar>
228
+ struct _reduce_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
229
+ template <>
230
+ struct _reduce_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
231
+
232
+ template <typename TyGroup>
233
+ using reduce_group_supported = _reduce_group_supported<details::remove_qual<TyGroup>>;
234
+
235
+ template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
236
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>&& op) -> decltype(op(val, val)) {
237
+ static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
238
+
239
+ using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
240
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
241
+ }
242
+
243
+ template <typename TyVal, typename TyFnInput, template <class> class TyOp, typename TyGroup>
244
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp<TyFnInput>& op) -> decltype(op(val, val)) {
245
+ static_assert(details::is_op_type_same<TyFnInput, TyVal>::value, "Operator and argument types differ");
246
+
247
+ using dispatch = details::_redux_dispatch<TyVal, TyFnInput, TyGroup>;
248
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp<TyFnInput>>(op));
249
+ }
250
+
251
+
252
+ template <typename TyVal, typename TyOp, typename TyGroup>
253
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyOp&& op) -> decltype(op(val, val)) {
254
+ return details::coalesced_reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyOp>(op));
255
+ }
256
+
257
+ template <unsigned int GroupId>
258
+ struct tile_reduce_dispatch;
259
+
260
+ template <>
261
+ struct tile_reduce_dispatch<details::coalesced_group_id> {
262
+ template <typename TyGroup, typename TyVal, typename TyFn>
263
+ _CG_STATIC_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
264
+ return details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
265
+ }
266
+ };
267
+
268
+ #if defined(_CG_CPP11_FEATURES)
269
+ template <>
270
+ struct tile_reduce_dispatch<details::multi_tile_group_id> {
271
+ template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
272
+ _CG_STATIC_QUALIFIER auto reduce(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
273
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
274
+ using TyRet = details::remove_qual<TyVal>;
275
+ const unsigned int num_warps = Size / 32;
276
+
277
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
278
+ *warp_scratch_location =
279
+ details::reduce(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
280
+ };
281
+ auto inter_warp_lambda =
282
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
283
+ *thread_scratch_location =
284
+ details::reduce(subwarp, *thread_scratch_location, _CG_STL_NAMESPACE::forward<TyFn>(op));
285
+ };
286
+ return details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
287
+ }
288
+ };
289
+
290
+ template <unsigned int GroupId>
291
+ struct tile_async_reduce_dispatch;
292
+
293
+ template <>
294
+ struct tile_async_reduce_dispatch<details::coalesced_group_id> {
295
+ template <typename GroupT, typename TyDst, typename TyVal, typename TyFn, typename TyResHandler>
296
+ _CG_STATIC_QUALIFIER void reduce(const GroupT& group, TyDst& dst, TyVal&& val, TyFn&& op, TyResHandler& res_handler) {
297
+ // Do regular, in group reduction
298
+ auto result = details::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
299
+
300
+ // One thread stores/updates the destination
301
+ if (group.thread_rank() == 0) {
302
+ res_handler(result);
303
+ }
304
+ }
305
+ };
306
+
307
+ template <>
308
+ struct tile_async_reduce_dispatch<details::multi_tile_group_id> {
309
+ template <unsigned int TySize, typename ParentT, typename TyDst, typename TyInputVal, typename TyFn, typename TyResHandler>
310
+ _CG_STATIC_QUALIFIER void reduce(const thread_block_tile<TySize, ParentT>& group, TyDst& dst, TyInputVal&& val, TyFn&& op, TyResHandler& res_handler) {
311
+ using TyVal = remove_qual<TyInputVal>;
312
+ const unsigned int num_warps = TySize / 32;
313
+ details::barrier_t* sync_location = multi_warp_sync_location_getter(group);
314
+ auto warp_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, group.thread_rank() / 32);
315
+
316
+ // Do in warp reduce
317
+ auto warp = details::tiled_partition_internal<32, thread_block_tile<TySize, ParentT>>();
318
+ *warp_scratch_location = details::reduce(warp, _CG_STL_NAMESPACE::forward<TyInputVal>(val), op);
319
+
320
+ // Tile of size num_warps from the last warp to arrive does final reduction step
321
+ if (details::sync_warps_last_releases(sync_location, details::cta::thread_rank(), num_warps)) {
322
+ auto subwarp = details::tiled_partition_internal<num_warps, decltype(warp)>();
323
+ if (subwarp.meta_group_rank() == 0) {
324
+ auto thread_scratch_location = multi_warp_scratch_location_getter<TyVal>(group, subwarp.thread_rank());
325
+ auto thread_val = *thread_scratch_location;
326
+ // Release other warps, we read their contribution already.
327
+ subwarp.sync();
328
+ details::sync_warps_release(sync_location, subwarp.thread_rank() == 0, details::cta::thread_rank(), num_warps);
329
+ TyVal result = details::reduce(subwarp, thread_val, op);
330
+ // One thread stores the result or updates the atomic
331
+ if (subwarp.thread_rank() == 0) {
332
+ res_handler(result);
333
+ }
334
+ }
335
+ warp.sync();
336
+ }
337
+ }
338
+ };
339
+ #endif
340
+
341
+ template <typename TyGroup, typename TyInputVal, typename TyRetVal>
342
+ _CG_QUALIFIER void check_reduce_params() {
343
+ static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
344
+ static_assert(details::reduce_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
345
+ };
346
+
347
+ template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
348
+ _CG_QUALIFIER void check_async_reduce_params() {
349
+ check_reduce_params<TyGroup, TyInputVal, TyRetVal>();
350
+ static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
351
+ }
352
+ } // details
353
+
354
+ template <typename TyGroup, typename TyVal, typename TyFn>
355
+ _CG_QUALIFIER auto reduce(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
356
+ details::check_reduce_params<TyGroup, details::remove_qual<TyVal>, decltype(op(val, val))>();
357
+
358
+ using dispatch = details::tile_reduce_dispatch<TyGroup::_group_id>;
359
+ return dispatch::reduce(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
360
+ }
361
+
362
+ #if defined(_CG_CPP11_FEATURES)
363
+
364
+ # if defined(_CG_HAS_STL_ATOMICS)
365
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
366
+ void _CG_QUALIFIER reduce_update_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
367
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
368
+ auto update_lambda = [&] (TyVal& result) {
369
+ details::atomic_update(dst, result, op);
370
+ };
371
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
372
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
373
+ }
374
+
375
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
376
+ void _CG_QUALIFIER reduce_update_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
377
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
378
+ auto update_lambda = [&] (TyVal& result) {
379
+ details::atomic_update(dst, result, op);
380
+ };
381
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
382
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), update_lambda);
383
+ }
384
+
385
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
386
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
387
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
388
+ auto store_lambda = [&] (TyVal& result) {
389
+ details::atomic_store(dst, result);
390
+ };
391
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
392
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
393
+ }
394
+
395
+ template<typename TyGroup, typename TyVal, cuda::thread_scope Sco, typename TyInputVal, typename TyFn>
396
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) {
397
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
398
+ auto store_lambda = [&] (TyVal& result) {
399
+ details::atomic_store(dst, result);
400
+ };
401
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
402
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
403
+ }
404
+ # endif
405
+
406
+ template<typename TyGroup, typename TyVal, typename TyInputVal, typename TyFn>
407
+ void _CG_QUALIFIER reduce_store_async(const TyGroup& group, TyVal* dst, TyInputVal&& val, TyFn&& op) {
408
+ details::check_async_reduce_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
409
+ auto store_lambda = [&] (TyVal& result) {
410
+ *dst = result;
411
+ };
412
+ using dispatch = details::tile_async_reduce_dispatch<TyGroup::_group_id>;
413
+ dispatch::reduce(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op), store_lambda);
414
+ }
415
+ #endif
416
+
417
+ _CG_END_NAMESPACE
418
+
419
+ #endif // _CG_REDUCE_H_
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/scan.h ADDED
@@ -0,0 +1,320 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_SCAN_H_
50
+ #define _CG_SCAN_H_
51
+
52
+ #include "info.h"
53
+ #include "helpers.h"
54
+ #include "functional.h"
55
+ #include "coalesced_scan.h"
56
+
57
+ _CG_BEGIN_NAMESPACE
58
+
59
+ namespace details {
60
+
61
+ // Group support for scan.
62
+ template <class TyGroup> struct _scan_group_supported : public _CG_STL_NAMESPACE::false_type {};
63
+
64
+ template <unsigned int Sz, typename TyPar>
65
+ struct _scan_group_supported<cooperative_groups::thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
66
+ template <unsigned int Sz, typename TyPar>
67
+ struct _scan_group_supported<internal_thread_block_tile<Sz, TyPar>> : public _CG_STL_NAMESPACE::true_type {};
68
+ template <>
69
+ struct _scan_group_supported<cooperative_groups::coalesced_group> : public _CG_STL_NAMESPACE::true_type {};
70
+
71
+ template <typename TyGroup>
72
+ using scan_group_supported = _scan_group_supported<details::remove_qual<TyGroup>>;
73
+
74
+ template <bool IsIntegralPlus>
75
+ struct integral_optimized_scan;
76
+
77
+ enum class ScanType { exclusive, inclusive };
78
+
79
+ template <unsigned int GroupId, ScanType TyScan>
80
+ struct scan_dispatch;
81
+
82
+ template <ScanType TyScan>
83
+ struct scan_dispatch<details::coalesced_group_id, TyScan> {
84
+ template <typename TyGroup, typename TyVal, typename TyFn>
85
+ _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
86
+ auto scan_result = coalesced_inclusive_scan(group, val, op);
87
+ if (TyScan == ScanType::exclusive) {
88
+ scan_result = convert_inclusive_to_exclusive(group,
89
+ scan_result,
90
+ _CG_STL_NAMESPACE::forward<TyVal>(val),
91
+ _CG_STL_NAMESPACE::forward<TyFn>(op));
92
+ }
93
+ return scan_result;
94
+ }
95
+ };
96
+
97
+ #if defined(_CG_CPP11_FEATURES)
98
+ template <ScanType TyScan>
99
+ struct scan_dispatch<details::multi_tile_group_id, TyScan> {
100
+ template <unsigned int Size, typename ParentT, typename TyVal, typename TyFn>
101
+ _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
102
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
103
+ using TyRet = details::remove_qual<TyVal>;
104
+ const unsigned int num_warps = Size / 32;
105
+ // In warp scan result, calculated in warp_lambda
106
+ TyRet warp_scan;
107
+
108
+ // In warp scan, put sum in the warp_scratch_location
109
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
110
+ warp_scan =
111
+ details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
112
+ if (warp.thread_rank() + 1 == warp.size()) {
113
+ *warp_scratch_location = warp_scan;
114
+ }
115
+ if (TyScan == ScanType::exclusive) {
116
+ warp_scan = warp.shfl_up(warp_scan, 1);
117
+ }
118
+ };
119
+
120
+ // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
121
+ // to its in-warp scan result
122
+ auto inter_warp_lambda =
123
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
124
+ auto thread_val = *thread_scratch_location;
125
+ auto result = coalesced_inclusive_scan(subwarp, thread_val, op);
126
+ *thread_scratch_location = convert_inclusive_to_exclusive(subwarp, result, thread_val, op);
127
+ };
128
+
129
+ TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
130
+ if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
131
+ return previous_warps_sum;
132
+ }
133
+ if (warpType::meta_group_rank() == 0) {
134
+ return warp_scan;
135
+ }
136
+ else {
137
+ return op(warp_scan, previous_warps_sum);
138
+ }
139
+ }
140
+ };
141
+
142
+ #if defined(_CG_HAS_STL_ATOMICS)
143
+ template <unsigned int GroupId, ScanType TyScan>
144
+ struct scan_update_dispatch;
145
+
146
+ template <ScanType TyScan>
147
+ struct scan_update_dispatch<details::coalesced_group_id, TyScan> {
148
+ template <typename TyGroup, typename TyAtomic, typename TyVal, typename TyFn>
149
+ _CG_STATIC_QUALIFIER auto scan(const TyGroup& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
150
+ details::remove_qual<TyVal> old;
151
+
152
+ // Do regular in group scan
153
+ auto scan_result = details::coalesced_inclusive_scan(group, val, op);
154
+
155
+ // Last thread updates the atomic and distributes its old value to other threads
156
+ if (group.thread_rank() == group.size() - 1) {
157
+ old = atomic_update(dst, scan_result, _CG_STL_NAMESPACE::forward<TyFn>(op));
158
+ }
159
+ old = group.shfl(old, group.size() - 1);
160
+ if (TyScan == ScanType::exclusive) {
161
+ scan_result = convert_inclusive_to_exclusive(group, scan_result, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
162
+ }
163
+ scan_result = op(old, scan_result);
164
+ return scan_result;
165
+ }
166
+ };
167
+
168
+ template <ScanType TyScan>
169
+ struct scan_update_dispatch<details::multi_tile_group_id, TyScan> {
170
+ template <unsigned int Size, typename ParentT, typename TyAtomic, typename TyVal, typename TyFn>
171
+ _CG_STATIC_QUALIFIER auto scan(const thread_block_tile<Size, ParentT>& group, TyAtomic& dst, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
172
+ using warpType = details::internal_thread_block_tile<32, __static_size_multi_warp_tile_base<Size>>;
173
+ using TyRet = details::remove_qual<TyVal>;
174
+ const unsigned int num_warps = Size / 32;
175
+ // In warp scan result, calculated in warp_lambda
176
+ TyRet warp_scan;
177
+
178
+ // In warp scan, put sum in the warp_scratch_location
179
+ auto warp_lambda = [&] (const warpType& warp, TyRet* warp_scratch_location) {
180
+ warp_scan =
181
+ details::coalesced_inclusive_scan(warp, _CG_STL_NAMESPACE::forward<TyVal>(val), op);
182
+ if (warp.thread_rank() + 1 == warp.size()) {
183
+ *warp_scratch_location = warp_scan;
184
+ }
185
+ if (TyScan == ScanType::exclusive) {
186
+ warp_scan = warp.shfl_up(warp_scan, 1);
187
+ }
188
+ };
189
+
190
+ // Tile of size num_warps performing the final scan part (exclusive scan of warp sums), other threads will add it
191
+ // to its in-warp scan result
192
+ auto inter_warp_lambda =
193
+ [&] (const details::internal_thread_block_tile<num_warps, warpType>& subwarp, TyRet* thread_scratch_location) {
194
+ auto thread_val = *thread_scratch_location;
195
+ auto scan_result = details::coalesced_inclusive_scan(subwarp, thread_val, op);
196
+ TyRet offset;
197
+ // Single thread does the atomic update with sum of all contributions and reads the old value.
198
+ if (subwarp.thread_rank() == subwarp.size() - 1) {
199
+ offset = details::atomic_update(dst, scan_result, op);
200
+ }
201
+ offset = subwarp.shfl(offset, subwarp.size() - 1);
202
+ scan_result = convert_inclusive_to_exclusive(subwarp, scan_result, thread_val, op);
203
+ // Add offset read from the atomic to the scanned warp sum.
204
+ // Skipping first thread, since it got defautly constructed value from the conversion,
205
+ // it should just return the offset received from the thread that did the atomic update.
206
+ if (subwarp.thread_rank() != 0) {
207
+ offset = op(scan_result, offset);
208
+ }
209
+ *thread_scratch_location = offset;
210
+ };
211
+
212
+ TyRet previous_warps_sum = details::multi_warp_collectives_helper<TyRet>(group, warp_lambda, inter_warp_lambda);
213
+ if (TyScan == ScanType::exclusive && warpType::thread_rank() == 0) {
214
+ return previous_warps_sum;
215
+ }
216
+ return op(warp_scan, previous_warps_sum);
217
+ }
218
+ };
219
+ #endif
220
+ #endif
221
+
222
+ template <typename TyGroup, typename TyInputVal, typename TyRetVal>
223
+ _CG_QUALIFIER void check_scan_params() {
224
+ static_assert(details::is_op_type_same<TyInputVal, TyRetVal>::value, "Operator input and output types differ");
225
+ static_assert(details::scan_group_supported<TyGroup>::value, "This group does not exclusively represent a tile");
226
+ }
227
+
228
+ #if defined(_CG_HAS_STL_ATOMICS)
229
+ template <typename TyGroup, typename TyDstVal, typename TyInputVal, typename TyRetVal>
230
+ _CG_QUALIFIER void check_scan_update_params() {
231
+ check_scan_params<TyGroup, TyInputVal, TyRetVal>();
232
+ static_assert(details::is_op_type_same<TyDstVal, TyInputVal>::value, "Destination and input types differ");
233
+ }
234
+ #endif
235
+
236
+ } // details
237
+
238
+ template <typename TyGroup, typename TyVal, typename TyFn>
239
+ _CG_QUALIFIER auto inclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
240
+ details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
241
+
242
+ using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
243
+ return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
244
+ }
245
+
246
+ template <typename TyGroup, typename TyVal>
247
+ _CG_QUALIFIER details::remove_qual<TyVal> inclusive_scan(const TyGroup& group, TyVal&& val) {
248
+ return inclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
249
+ }
250
+
251
+ template <typename TyGroup, typename TyVal, typename TyFn>
252
+ _CG_QUALIFIER auto exclusive_scan(const TyGroup& group, TyVal&& val, TyFn&& op) -> decltype(op(val, val)) {
253
+ details::check_scan_params<TyGroup, TyVal, decltype(op(val, val))>();
254
+
255
+ using dispatch = details::scan_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
256
+ return dispatch::scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
257
+ }
258
+
259
+ template <typename TyGroup, typename TyVal>
260
+ _CG_QUALIFIER details::remove_qual<TyVal> exclusive_scan(const TyGroup& group, TyVal&& val) {
261
+ return exclusive_scan(group, _CG_STL_NAMESPACE::forward<TyVal>(val), cooperative_groups::plus<details::remove_qual<TyVal>>());
262
+ }
263
+
264
+ #if defined(_CG_HAS_STL_ATOMICS)
265
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
266
+ _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
267
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
268
+
269
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
270
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
271
+ }
272
+
273
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
274
+ _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco> & dst, TyInputVal&& val) {
275
+ return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
276
+ }
277
+
278
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
279
+ _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
280
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
281
+
282
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
283
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
284
+ }
285
+
286
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
287
+ _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, cuda::atomic<TyVal, Sco>& dst, TyInputVal&& val) {
288
+ return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
289
+ }
290
+
291
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
292
+ _CG_QUALIFIER auto inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
293
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
294
+
295
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::inclusive>;
296
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
297
+ }
298
+
299
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
300
+ _CG_QUALIFIER TyVal inclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco> & dst, TyInputVal&& val) {
301
+ return inclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
302
+ }
303
+
304
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco, typename TyFn>
305
+ _CG_QUALIFIER auto exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val, TyFn&& op) -> decltype(op(val, val)) {
306
+ details::check_scan_update_params<TyGroup, TyVal, details::remove_qual<TyInputVal>, decltype(op(val, val))>();
307
+
308
+ using dispatch = details::scan_update_dispatch<TyGroup::_group_id, details::ScanType::exclusive>;
309
+ return dispatch::scan(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), _CG_STL_NAMESPACE::forward<TyFn>(op));
310
+ }
311
+
312
+ template<typename TyGroup, typename TyVal, typename TyInputVal, cuda::thread_scope Sco>
313
+ _CG_QUALIFIER TyVal exclusive_scan_update(const TyGroup& group, const cuda::atomic_ref<TyVal, Sco>& dst, TyInputVal&& val) {
314
+ return exclusive_scan_update(group, dst, _CG_STL_NAMESPACE::forward<TyInputVal>(val), cooperative_groups::plus<TyVal>());
315
+ }
316
+ #endif
317
+
318
+ _CG_END_NAMESPACE
319
+
320
+ #endif // _CG_SCAN_H_
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/include/cooperative_groups/details/sync.h ADDED
@@ -0,0 +1,282 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Copyright 1993-2016 NVIDIA Corporation. All rights reserved.
2
+ *
3
+ * NOTICE TO LICENSEE:
4
+ *
5
+ * The source code and/or documentation ("Licensed Deliverables") are
6
+ * subject to NVIDIA intellectual property rights under U.S. and
7
+ * international Copyright laws.
8
+ *
9
+ * The Licensed Deliverables contained herein are PROPRIETARY and
10
+ * CONFIDENTIAL to NVIDIA and are being provided under the terms and
11
+ * conditions of a form of NVIDIA software license agreement by and
12
+ * between NVIDIA and Licensee ("License Agreement") or electronically
13
+ * accepted by Licensee. Notwithstanding any terms or conditions to
14
+ * the contrary in the License Agreement, reproduction or disclosure
15
+ * of the Licensed Deliverables to any third party without the express
16
+ * written consent of NVIDIA is prohibited.
17
+ *
18
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
19
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
20
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. THEY ARE
21
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
22
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
23
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
24
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
25
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
26
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
27
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
28
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
29
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
30
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
31
+ * OF THESE LICENSED DELIVERABLES.
32
+ *
33
+ * U.S. Government End Users. These Licensed Deliverables are a
34
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
35
+ * 1995), consisting of "commercial computer software" and "commercial
36
+ * computer software documentation" as such terms are used in 48
37
+ * C.F.R. 12.212 (SEPT 1995) and are provided to the U.S. Government
38
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
39
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
40
+ * U.S. Government End Users acquire the Licensed Deliverables with
41
+ * only those rights set forth herein.
42
+ *
43
+ * Any use of the Licensed Deliverables in individual and commercial
44
+ * software must include, in the user documentation and internal
45
+ * comments to the code, the above Disclaimer and U.S. Government End
46
+ * Users Notice.
47
+ */
48
+
49
+ #ifndef _CG_GRID_H
50
+ #define _CG_GRID_H
51
+
52
+ #include "info.h"
53
+
54
+ _CG_BEGIN_NAMESPACE
55
+
56
+ namespace details
57
+ {
58
+
59
+ typedef unsigned int barrier_t;
60
+
61
+ _CG_STATIC_QUALIFIER bool bar_has_flipped(unsigned int old_arrive, unsigned int current_arrive) {
62
+ return (((old_arrive ^ current_arrive) & 0x80000000) != 0);
63
+ }
64
+
65
+ _CG_STATIC_QUALIFIER bool is_cta_master() {
66
+ return (threadIdx.x + threadIdx.y + threadIdx.z == 0);
67
+ }
68
+
69
+ _CG_STATIC_QUALIFIER unsigned int sync_grids_arrive(volatile barrier_t *arrived) {
70
+ unsigned int oldArrive = 0;
71
+
72
+ __barrier_sync(0);
73
+
74
+ if (is_cta_master()) {
75
+ unsigned int expected = gridDim.x * gridDim.y * gridDim.z;
76
+ bool gpu_master = (blockIdx.x + blockIdx.y + blockIdx.z == 0);
77
+ unsigned int nb = 1;
78
+
79
+ if (gpu_master) {
80
+ nb = 0x80000000 - (expected - 1);
81
+ }
82
+
83
+ #if __CUDA_ARCH__ < 700
84
+ // Fence; barrier update; volatile polling; fence
85
+ __threadfence();
86
+
87
+ oldArrive = atomicAdd((unsigned int*)arrived, nb);
88
+ #else
89
+ // Barrier update with release; polling with acquire
90
+ asm volatile("atom.add.release.gpu.u32 %0,[%1],%2;" : "=r"(oldArrive) : _CG_ASM_PTR_CONSTRAINT((unsigned int*)arrived), "r"(nb) : "memory");
91
+ #endif
92
+ }
93
+
94
+ return oldArrive;
95
+ }
96
+
97
+
98
+ _CG_STATIC_QUALIFIER void sync_grids_wait(unsigned int oldArrive, volatile barrier_t *arrived) {
99
+ if (is_cta_master()) {
100
+ #if __CUDA_ARCH__ < 700
101
+ while (!bar_has_flipped(oldArrive, *arrived));
102
+
103
+ __threadfence();
104
+
105
+ #else
106
+ unsigned int current_arrive;
107
+ do {
108
+ asm volatile("ld.acquire.gpu.u32 %0,[%1];" : "=r"(current_arrive) : _CG_ASM_PTR_CONSTRAINT((unsigned int *)arrived) : "memory");
109
+ } while (!bar_has_flipped(oldArrive, current_arrive));
110
+ #endif
111
+ }
112
+
113
+ __barrier_sync(0);
114
+ }
115
+
116
+ /* - Multi warp groups synchronization routines - */
117
+
118
+ // Need both acquire and release for the last warp, since it won't be able to acquire with red.and
119
+ _CG_STATIC_QUALIFIER unsigned int atom_or_acq_rel_cta(unsigned int *addr, unsigned int val) {
120
+ unsigned int old;
121
+ #if __CUDA_ARCH__ < 700
122
+ __threadfence_block();
123
+ old = atomicOr(addr, val);
124
+ #else
125
+ asm volatile("atom.or.acq_rel.cta.b32 %0,[%1],%2;" : "=r"(old) : _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");
126
+ #endif
127
+ return old;
128
+ }
129
+
130
+ // Special case where barrier is arrived, but not waited on
131
+ _CG_STATIC_QUALIFIER void red_or_release_cta(unsigned int *addr, unsigned int val) {
132
+ #if __CUDA_ARCH__ < 700
133
+ __threadfence_block();
134
+ atomicOr(addr, val);
135
+ #else
136
+ asm volatile("red.or.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");
137
+ #endif
138
+ }
139
+
140
+ // Usually called by last arriving warp to released other warps, can be relaxed, since or was already acq_rel
141
+ _CG_STATIC_QUALIFIER void red_and_relaxed_cta(unsigned int *addr, unsigned int val) {
142
+ #if __CUDA_ARCH__ < 700
143
+ atomicAnd(addr, val);
144
+ #else
145
+ asm volatile("red.and.relaxed.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");
146
+ #endif
147
+ }
148
+
149
+ // Special case of release, where last warp was doing extra work before releasing others, need to be release
150
+ // to ensure that extra work is visible
151
+ _CG_STATIC_QUALIFIER void red_and_release_cta(unsigned int *addr, unsigned int val) {
152
+ #if __CUDA_ARCH__ < 700
153
+ __threadfence_block();
154
+ atomicAnd(addr, val);
155
+ #else
156
+ asm volatile("red.and.release.cta.b32 [%0],%1;" :: _CG_ASM_PTR_CONSTRAINT(addr), "r"(val) : "memory");
157
+ #endif
158
+ }
159
+
160
+ // Read the barrier, acquire to ensure all memory operations following the sync are correctly performed after it is released
161
+ _CG_STATIC_QUALIFIER unsigned int ld_acquire_cta(unsigned int *addr) {
162
+ unsigned int val;
163
+ #if __CUDA_ARCH__ < 700
164
+ val = *((volatile unsigned int*) addr);
165
+ __threadfence_block();
166
+ #else
167
+ asm volatile("ld.acquire.cta.u32 %0,[%1];" : "=r"(val) : _CG_ASM_PTR_CONSTRAINT(addr) : "memory");
168
+ #endif
169
+ return val;
170
+ }
171
+
172
+ // Get synchronization bit mask of my thread_block_tile of size num_warps. Thread ranks 0..31 have the first bit assigned to them,
173
+ // thread ranks 32..63 second etc
174
+ // Bit masks are unique for each group, groups of the same size will have the same number of bits set, but on different positions
175
+ _CG_STATIC_QUALIFIER unsigned int get_group_mask(unsigned int thread_rank, unsigned int num_warps) {
176
+ return num_warps == 32 ? ~0 : ((1 << num_warps) - 1) << (num_warps * (thread_rank / (num_warps * 32)));
177
+ }
178
+
179
+ _CG_STATIC_QUALIFIER void barrier_wait(barrier_t *arrived, unsigned int warp_bit) {
180
+ while(ld_acquire_cta(arrived) & warp_bit);
181
+ }
182
+
183
+ // Default blocking sync.
184
+ _CG_STATIC_QUALIFIER void sync_warps(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
185
+ unsigned int warp_id = thread_rank / 32;
186
+ bool warp_master = (thread_rank % 32 == 0);
187
+ unsigned int warp_bit = 1 << warp_id;
188
+ unsigned int group_mask = get_group_mask(thread_rank, num_warps);
189
+
190
+ __syncwarp(0xFFFFFFFF);
191
+
192
+ if (warp_master) {
193
+ unsigned int old = atom_or_acq_rel_cta(arrived, warp_bit);
194
+ if (((old | warp_bit) & group_mask) == group_mask) {
195
+ red_and_relaxed_cta(arrived, ~group_mask);
196
+ }
197
+ else {
198
+ barrier_wait(arrived, warp_bit);
199
+ }
200
+ }
201
+
202
+ __syncwarp(0xFFFFFFFF);
203
+ }
204
+
205
+ // Blocking sync, except the last arriving warp, that releases other warps, returns to do other stuff first.
206
+ // Warp returning true from this function needs to call sync_warps_release.
207
+ _CG_STATIC_QUALIFIER bool sync_warps_last_releases(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
208
+ unsigned int warp_id = thread_rank / 32;
209
+ bool warp_master = (thread_rank % 32 == 0);
210
+ unsigned int warp_bit = 1 << warp_id;
211
+ unsigned int group_mask = get_group_mask(thread_rank, num_warps);
212
+
213
+ __syncwarp(0xFFFFFFFF);
214
+
215
+ unsigned int old = 0;
216
+ if (warp_master) {
217
+ old = atom_or_acq_rel_cta(arrived, warp_bit);
218
+ }
219
+ old = __shfl_sync(0xFFFFFFFF, old, 0);
220
+ if (((old | warp_bit) & group_mask) == group_mask) {
221
+ return true;
222
+ }
223
+ barrier_wait(arrived, warp_bit);
224
+
225
+ return false;
226
+ }
227
+
228
+ // Release my group from the barrier.
229
+ _CG_STATIC_QUALIFIER void sync_warps_release(barrier_t *arrived, bool is_master, unsigned int thread_rank, unsigned int num_warps) {
230
+ unsigned int group_mask = get_group_mask(thread_rank, num_warps);
231
+ if (is_master) {
232
+ red_and_release_cta(arrived, ~group_mask);
233
+ }
234
+ }
235
+
236
+ // Arrive at my group barrier, but don't block or release the barrier, even if every one arrives.
237
+ // sync_warps_release needs to be called by some warp after this one to reset the barrier.
238
+ _CG_STATIC_QUALIFIER void sync_warps_arrive(barrier_t *arrived, unsigned int thread_rank, unsigned int num_warps) {
239
+ unsigned int warp_id = thread_rank / 32;
240
+ bool warp_master = (thread_rank % 32 == 0);
241
+ unsigned int warp_bit = 1 << warp_id;
242
+ unsigned int group_mask = get_group_mask(thread_rank, num_warps);
243
+
244
+ __syncwarp(0xFFFFFFFF);
245
+
246
+ if (warp_master) {
247
+ red_or_release_cta(arrived, warp_bit);
248
+ }
249
+ }
250
+
251
+ // Wait for my warp to be released from the barrier. Warp must have arrived first.
252
+ _CG_STATIC_QUALIFIER void sync_warps_wait(barrier_t *arrived, unsigned int thread_rank) {
253
+ unsigned int warp_id = thread_rank / 32;
254
+ unsigned int warp_bit = 1 << warp_id;
255
+
256
+ barrier_wait(arrived, warp_bit);
257
+ }
258
+
259
+ // Wait for specific warp to arrive at the barrier
260
+ _CG_QUALIFIER void sync_warps_wait_for_specific_warp(barrier_t *arrived, unsigned int wait_warp_id) {
261
+ unsigned int wait_mask = 1 << wait_warp_id;
262
+ while((ld_acquire_cta(arrived) & wait_mask) != wait_mask);
263
+ }
264
+
265
+ // Initialize the bit corresponding to my warp in the barrier
266
+ _CG_QUALIFIER void sync_warps_reset(barrier_t *arrived, unsigned int thread_rank) {
267
+ unsigned int warp_id = thread_rank / 32;
268
+ unsigned int warp_bit = 1 << warp_id;
269
+
270
+ __syncwarp(0xFFFFFFFF);
271
+
272
+ if (thread_rank % 32 == 0) {
273
+ red_and_release_cta(arrived, ~warp_bit);
274
+ }
275
+ // No need to sync after the atomic, there will be a sync of the group that is being partitioned right after this.
276
+ }
277
+
278
+ } // details
279
+
280
+ _CG_END_NAMESPACE
281
+
282
+ #endif // _CG_GRID_H
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (196 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cuda_runtime/lib/libcudart.so.12 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8774224f5b11a73b15d074a3fcce7327322c5c4cfdfd924d6a826779eec968fe
3
+ size 707904
.venv/lib/python3.11/site-packages/nvidia/cudnn/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn.h ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn : Neural Networks Library */
51
+
52
+ #if !defined(CUDNN_H_)
53
+ #define CUDNN_H_
54
+ #if defined(__cplusplus)
55
+ extern "C" {
56
+ #endif
57
+
58
+ #include <cuda_runtime_api.h>
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_graph.h"
61
+ #include "cudnn_ops.h"
62
+ #include "cudnn_adv.h"
63
+ #include "cudnn_cnn.h"
64
+
65
+ #if defined(__cplusplus)
66
+ }
67
+ #endif
68
+ #endif /* CUDNN_H_ */
.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_adv_v9.h ADDED
@@ -0,0 +1,671 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /* cudnn_adv : cuDNN's advanced and experimental features.
51
+
52
+ */
53
+
54
+ #if !defined(CUDNN_ADV_H_)
55
+ #define CUDNN_ADV_H_
56
+
57
+ #include <stdint.h>
58
+
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_ops.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_ADV_MAJOR 9
64
+ #define CUDNN_ADV_MINOR 1
65
+ #define CUDNN_ADV_PATCH 0
66
+
67
+ #if (CUDNN_ADV_MAJOR != CUDNN_MAJOR) || (CUDNN_ADV_MINOR != CUDNN_MINOR) || (CUDNN_ADV_PATCH != CUDNN_PATCHLEVEL)
68
+ #error Version mismatch in cuDNN ADV INFER!!!
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ /* BASIC RNN API */
76
+
77
+ typedef enum {
78
+ CUDNN_RNN_ALGO_STANDARD = 0,
79
+ CUDNN_RNN_ALGO_PERSIST_STATIC = 1,
80
+ CUDNN_RNN_ALGO_PERSIST_DYNAMIC = 2,
81
+ CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H = 3,
82
+ CUDNN_RNN_ALGO_COUNT = 4,
83
+ } cudnnRNNAlgo_t;
84
+
85
+ typedef enum {
86
+ CUDNN_FWD_MODE_INFERENCE = 0,
87
+ CUDNN_FWD_MODE_TRAINING = 1,
88
+ } cudnnForwardMode_t;
89
+
90
+ typedef enum {
91
+ CUDNN_RNN_RELU = 0, /* basic RNN cell type with ReLu activation */
92
+ CUDNN_RNN_TANH = 1, /* basic RNN cell type with tanh activation */
93
+ CUDNN_LSTM = 2, /* LSTM with optional recurrent projection and clipping */
94
+ CUDNN_GRU = 3, /* Using h' = tanh(r * Uh(t-1) + Wx) and h = (1 - z) * h' + z * h(t-1); */
95
+ } cudnnRNNMode_t;
96
+
97
+ typedef enum {
98
+ CUDNN_RNN_NO_BIAS = 0, /* rnn cell formulas do not use biases */
99
+ CUDNN_RNN_SINGLE_INP_BIAS = 1, /* rnn cell formulas use one input bias in input GEMM */
100
+ CUDNN_RNN_DOUBLE_BIAS = 2, /* default, rnn cell formulas use two bias vectors */
101
+ CUDNN_RNN_SINGLE_REC_BIAS = 3 /* rnn cell formulas use one recurrent bias in recurrent GEMM */
102
+ } cudnnRNNBiasMode_t;
103
+
104
+ typedef enum {
105
+ CUDNN_UNIDIRECTIONAL = 0, /* single direction network */
106
+ CUDNN_BIDIRECTIONAL = 1, /* output concatination at each layer */
107
+ } cudnnDirectionMode_t;
108
+
109
+ typedef enum {
110
+ CUDNN_LINEAR_INPUT = 0, /* adjustable weight matrix in first layer input GEMM */
111
+ CUDNN_SKIP_INPUT = 1, /* fixed identity matrix in the first layer input GEMM */
112
+ } cudnnRNNInputMode_t;
113
+
114
+ typedef enum {
115
+ CUDNN_RNN_CLIP_NONE = 0, /* disables LSTM cell clipping */
116
+ CUDNN_RNN_CLIP_MINMAX = 1, /* enables LSTM cell clipping */
117
+ } cudnnRNNClipMode_t;
118
+
119
+ typedef enum {
120
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED = 0, /* padded, outer stride from one time-step to the next */
121
+ CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED = 1, /* sequence length sorted and packed as in basic RNN api */
122
+ CUDNN_RNN_DATA_LAYOUT_BATCH_MAJOR_UNPACKED = 2, /* padded, outer stride from one batch to the next */
123
+ } cudnnRNNDataLayout_t;
124
+
125
+ /* For auxFlags in cudnnSetRNNDescriptor_v8() */
126
+ #define CUDNN_RNN_PADDED_IO_DISABLED 0
127
+ #define CUDNN_RNN_PADDED_IO_ENABLED (1U << 0)
128
+
129
+ struct cudnnRNNStruct;
130
+ typedef struct cudnnRNNStruct *cudnnRNNDescriptor_t;
131
+
132
+ struct cudnnRNNDataStruct;
133
+ typedef struct cudnnRNNDataStruct *cudnnRNNDataDescriptor_t;
134
+
135
+ cudnnStatus_t CUDNNWINAPI
136
+ cudnnCreateRNNDescriptor(cudnnRNNDescriptor_t *rnnDesc);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnDestroyRNNDescriptor(cudnnRNNDescriptor_t rnnDesc);
140
+
141
+ /*
142
+ * mathPrec in cudnnSetRNNDescriptor_v8() specifies compute precision.
143
+ * Compute precision is further modified by mathType that sets the
144
+ * preferred option for using NVIDIA Tensor Cores. dataType specify
145
+ * input/output data type and weight/bias type.
146
+ */
147
+
148
+ cudnnStatus_t CUDNNWINAPI
149
+ cudnnSetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
150
+ cudnnRNNAlgo_t algo,
151
+ cudnnRNNMode_t cellMode,
152
+ cudnnRNNBiasMode_t biasMode,
153
+ cudnnDirectionMode_t dirMode,
154
+ cudnnRNNInputMode_t inputMode,
155
+ cudnnDataType_t dataType,
156
+ cudnnDataType_t mathPrec,
157
+ cudnnMathType_t mathType,
158
+ int32_t inputSize,
159
+ int32_t hiddenSize,
160
+ int32_t projSize,
161
+ int32_t numLayers,
162
+ cudnnDropoutDescriptor_t dropoutDesc,
163
+ uint32_t auxFlags);
164
+
165
+ cudnnStatus_t CUDNNWINAPI
166
+ cudnnGetRNNDescriptor_v8(cudnnRNNDescriptor_t rnnDesc,
167
+ cudnnRNNAlgo_t *algo,
168
+ cudnnRNNMode_t *cellMode,
169
+ cudnnRNNBiasMode_t *biasMode,
170
+ cudnnDirectionMode_t *dirMode,
171
+ cudnnRNNInputMode_t *inputMode,
172
+ cudnnDataType_t *dataType,
173
+ cudnnDataType_t *mathPrec,
174
+ cudnnMathType_t *mathType,
175
+ int32_t *inputSize,
176
+ int32_t *hiddenSize,
177
+ int32_t *projSize,
178
+ int32_t *numLayers,
179
+ cudnnDropoutDescriptor_t *dropoutDesc,
180
+ uint32_t *auxFlags);
181
+
182
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
183
+ cudnnRNNSetClip_v8(cudnnRNNDescriptor_t rnnDesc,
184
+ cudnnRNNClipMode_t clipMode,
185
+ cudnnNanPropagation_t clipNanOpt,
186
+ double lclip,
187
+ double rclip);
188
+
189
+ cudnnStatus_t CUDNNWINAPI
190
+ cudnnRNNSetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t clipMode, double lclip, double rclip);
191
+
192
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
193
+ cudnnRNNGetClip_v8(cudnnRNNDescriptor_t rnnDesc,
194
+ cudnnRNNClipMode_t *clipMode,
195
+ cudnnNanPropagation_t *clipNanOpt,
196
+ double *lclip,
197
+ double *rclip);
198
+
199
+ cudnnStatus_t CUDNNWINAPI
200
+ cudnnRNNGetClip_v9(cudnnRNNDescriptor_t rnnDesc, cudnnRNNClipMode_t *clipMode, double *lclip, double *rclip);
201
+
202
+ cudnnStatus_t CUDNNWINAPI
203
+ cudnnBuildRNNDynamic(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, int miniBatch);
204
+
205
+ cudnnStatus_t CUDNNWINAPI
206
+ cudnnGetRNNTempSpaceSizes(cudnnHandle_t handle,
207
+ cudnnRNNDescriptor_t rnnDesc,
208
+ cudnnForwardMode_t fwdMode,
209
+ cudnnRNNDataDescriptor_t xDesc,
210
+ size_t *workSpaceSize,
211
+ size_t *reserveSpaceSize);
212
+
213
+ cudnnStatus_t CUDNNWINAPI
214
+ cudnnGetRNNWeightSpaceSize(cudnnHandle_t handle, cudnnRNNDescriptor_t rnnDesc, size_t *weightSpaceSize);
215
+
216
+ cudnnStatus_t CUDNNWINAPI
217
+ cudnnGetRNNWeightParams(cudnnHandle_t handle,
218
+ cudnnRNNDescriptor_t rnnDesc,
219
+ int32_t pseudoLayer,
220
+ size_t weightSpaceSize,
221
+ const void *weightSpace,
222
+ int32_t linLayerID,
223
+ cudnnTensorDescriptor_t mDesc,
224
+ void **mAddr,
225
+ cudnnTensorDescriptor_t bDesc,
226
+ void **bAddr);
227
+
228
+ cudnnStatus_t CUDNNWINAPI
229
+ cudnnCreateRNNDataDescriptor(cudnnRNNDataDescriptor_t *rnnDataDesc);
230
+
231
+ cudnnStatus_t CUDNNWINAPI
232
+ cudnnDestroyRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc);
233
+
234
+ cudnnStatus_t CUDNNWINAPI
235
+ cudnnSetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
236
+ cudnnDataType_t dataType,
237
+ cudnnRNNDataLayout_t layout,
238
+ int maxSeqLength,
239
+ int batchSize,
240
+ int vectorSize,
241
+ const int seqLengthArray[], /* length of each sequence in the batch */
242
+ void *paddingFill); /* symbol for filling padding position in output */
243
+
244
+ cudnnStatus_t CUDNNWINAPI
245
+ cudnnGetRNNDataDescriptor(cudnnRNNDataDescriptor_t rnnDataDesc,
246
+ cudnnDataType_t *dataType,
247
+ cudnnRNNDataLayout_t *layout,
248
+ int *maxSeqLength,
249
+ int *batchSize,
250
+ int *vectorSize,
251
+ int arrayLengthRequested,
252
+ int seqLengthArray[],
253
+ void *paddingFill);
254
+
255
+ cudnnStatus_t CUDNNWINAPI
256
+ cudnnRNNForward(cudnnHandle_t handle,
257
+ cudnnRNNDescriptor_t rnnDesc,
258
+ cudnnForwardMode_t fwdMode,
259
+ const int32_t devSeqLengths[],
260
+ cudnnRNNDataDescriptor_t xDesc,
261
+ const void *x,
262
+ cudnnRNNDataDescriptor_t yDesc,
263
+ void *y,
264
+ cudnnTensorDescriptor_t hDesc,
265
+ const void *hx,
266
+ void *hy,
267
+ cudnnTensorDescriptor_t cDesc,
268
+ const void *cx,
269
+ void *cy,
270
+ size_t weightSpaceSize,
271
+ const void *weightSpace,
272
+ size_t workSpaceSize,
273
+ void *workSpace,
274
+ size_t reserveSpaceSize,
275
+ void *reserveSpace);
276
+
277
+ /* Sequence data descriptor */
278
+
279
+ typedef enum {
280
+ CUDNN_SEQDATA_TIME_DIM = 0, /* index in time */
281
+ CUDNN_SEQDATA_BATCH_DIM = 1, /* index in batch */
282
+ CUDNN_SEQDATA_BEAM_DIM = 2, /* index in beam */
283
+ CUDNN_SEQDATA_VECT_DIM = 3 /* index in vector */
284
+ } cudnnSeqDataAxis_t;
285
+
286
+ struct cudnnSeqDataStruct;
287
+ typedef struct cudnnSeqDataStruct *cudnnSeqDataDescriptor_t CUDNN_DEPRECATED;
288
+
289
+ #define CUDNN_SEQDATA_DIM_COUNT 4 /* dimension count */
290
+
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnCreateSeqDataDescriptor(cudnnSeqDataDescriptor_t *seqDataDesc);
293
+
294
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
295
+ cudnnDestroySeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc);
296
+
297
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
298
+ cudnnSetSeqDataDescriptor(cudnnSeqDataDescriptor_t seqDataDesc,
299
+ cudnnDataType_t dataType,
300
+ int nbDims,
301
+ const int dimA[],
302
+ const cudnnSeqDataAxis_t axes[],
303
+ size_t seqLengthArraySize,
304
+ const int seqLengthArray[],
305
+ void *paddingFill);
306
+
307
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
308
+ cudnnGetSeqDataDescriptor(const cudnnSeqDataDescriptor_t seqDataDesc,
309
+ cudnnDataType_t *dataType,
310
+ int *nbDims,
311
+ int nbDimsRequested,
312
+ int dimA[],
313
+ cudnnSeqDataAxis_t axes[],
314
+ size_t *seqLengthArraySize,
315
+ size_t seqLengthSizeRequested,
316
+ int seqLengthArray[],
317
+ void *paddingFill);
318
+
319
+ /* Multihead Attention */
320
+
321
+ /*
322
+ * Multi-head attention options passed via 'attnMode' in cudnnSetAttnDescriptor().
323
+ * Use the bitwise OR operator to combine several settings listed below. Additional
324
+ * minor options can be added here w/o changing or introducing new API functions.
325
+ */
326
+ #define CUDNN_ATTN_QUERYMAP_ALL_TO_ONE 0 /* multiple Q-s map to a single (K,V) set when beam size > 1 */
327
+ #define CUDNN_ATTN_QUERYMAP_ONE_TO_ONE (1U << 0) /* multiple Q-s map to multiple (K,V) sets when beam size > 1 */
328
+ #define CUDNN_ATTN_DISABLE_PROJ_BIASES 0 /* no biases in attention input and output projections */
329
+ #define CUDNN_ATTN_ENABLE_PROJ_BIASES (1U << 1) /* use biases in attention input and output projections */
330
+
331
+ struct cudnnAttnStruct;
332
+ typedef struct cudnnAttnStruct *cudnnAttnDescriptor_t CUDNN_DEPRECATED;
333
+
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnCreateAttnDescriptor(cudnnAttnDescriptor_t *attnDesc);
336
+
337
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
338
+ cudnnDestroyAttnDescriptor(cudnnAttnDescriptor_t attnDesc);
339
+
340
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
341
+ cudnnSetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
342
+ unsigned attnMode,
343
+ int nHeads,
344
+ double smScaler,
345
+ cudnnDataType_t dataType,
346
+ cudnnDataType_t computePrec,
347
+ cudnnMathType_t mathType,
348
+ cudnnDropoutDescriptor_t attnDropoutDesc,
349
+ cudnnDropoutDescriptor_t postDropoutDesc,
350
+ int qSize,
351
+ int kSize,
352
+ int vSize,
353
+ int qProjSize,
354
+ int kProjSize,
355
+ int vProjSize,
356
+ int oProjSize,
357
+ int qoMaxSeqLength,
358
+ int kvMaxSeqLength,
359
+ int maxBatchSize,
360
+ int maxBeamSize);
361
+
362
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
363
+ cudnnGetAttnDescriptor(cudnnAttnDescriptor_t attnDesc,
364
+ unsigned *attnMode,
365
+ int *nHeads,
366
+ double *smScaler,
367
+ cudnnDataType_t *dataType,
368
+ cudnnDataType_t *computePrec,
369
+ cudnnMathType_t *mathType,
370
+ cudnnDropoutDescriptor_t *attnDropoutDesc,
371
+ cudnnDropoutDescriptor_t *postDropoutDesc,
372
+ int *qSize,
373
+ int *kSize,
374
+ int *vSize,
375
+ int *qProjSize,
376
+ int *kProjSize,
377
+ int *vProjSize,
378
+ int *oProjSize,
379
+ int *qoMaxSeqLength,
380
+ int *kvMaxSeqLength,
381
+ int *maxBatchSize,
382
+ int *maxBeamSize);
383
+
384
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
385
+ cudnnGetMultiHeadAttnBuffers(cudnnHandle_t handle,
386
+ const cudnnAttnDescriptor_t attnDesc,
387
+ size_t *weightSizeInBytes,
388
+ size_t *workSpaceSizeInBytes,
389
+ size_t *reserveSpaceSizeInBytes);
390
+
391
+ typedef enum {
392
+ CUDNN_MH_ATTN_Q_WEIGHTS = 0, /* input projection weights for 'queries' */
393
+ CUDNN_MH_ATTN_K_WEIGHTS = 1, /* input projection weights for 'keys' */
394
+ CUDNN_MH_ATTN_V_WEIGHTS = 2, /* input projection weights for 'values' */
395
+ CUDNN_MH_ATTN_O_WEIGHTS = 3, /* output projection weights */
396
+ CUDNN_MH_ATTN_Q_BIASES = 4, /* input projection bias tensor for 'queries' */
397
+ CUDNN_MH_ATTN_K_BIASES = 5, /* input projection bias for 'keys' */
398
+ CUDNN_MH_ATTN_V_BIASES = 6, /* input projection bias for 'values' */
399
+ CUDNN_MH_ATTN_O_BIASES = 7, /* output projection biases */
400
+ } cudnnMultiHeadAttnWeightKind_t;
401
+
402
+ #define CUDNN_ATTN_WKIND_COUNT 8 /* Number of attention weight/bias tensors */
403
+
404
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
405
+ cudnnGetMultiHeadAttnWeights(cudnnHandle_t handle,
406
+ const cudnnAttnDescriptor_t attnDesc,
407
+ cudnnMultiHeadAttnWeightKind_t wKind,
408
+ size_t weightSizeInBytes,
409
+ const void *weights,
410
+ cudnnTensorDescriptor_t wDesc,
411
+ void **wAddr);
412
+
413
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
414
+ cudnnMultiHeadAttnForward(cudnnHandle_t handle,
415
+ const cudnnAttnDescriptor_t attnDesc,
416
+ int currIdx,
417
+ const int loWinIdx[],
418
+ const int hiWinIdx[],
419
+ const int devSeqLengthsQO[],
420
+ const int devSeqLengthsKV[],
421
+ const cudnnSeqDataDescriptor_t qDesc,
422
+ const void *queries,
423
+ const void *residuals,
424
+ const cudnnSeqDataDescriptor_t kDesc,
425
+ const void *keys,
426
+ const cudnnSeqDataDescriptor_t vDesc,
427
+ const void *values,
428
+ const cudnnSeqDataDescriptor_t oDesc,
429
+ void *out,
430
+ size_t weightSizeInBytes,
431
+ const void *weights,
432
+ size_t workSpaceSizeInBytes,
433
+ void *workSpace,
434
+ size_t reserveSpaceSizeInBytes,
435
+ void *reserveSpace);
436
+
437
+ /*
438
+ * \brief Cross-library version checker.
439
+ * This function is implemented differently in each sub-library. Each sublib
440
+ * checks whether its own version matches that of its dependencies.
441
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
442
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
443
+ */
444
+ cudnnStatus_t CUDNNWINAPI
445
+ cudnnAdvVersionCheck(void);
446
+
447
+ typedef enum {
448
+ CUDNN_WGRAD_MODE_ADD = 0, /* add partial gradients to wgrad output buffers */
449
+ CUDNN_WGRAD_MODE_SET = 1, /* write partial gradients to wgrad output buffers */
450
+ } cudnnWgradMode_t;
451
+
452
+ cudnnStatus_t CUDNNWINAPI
453
+ cudnnRNNBackwardData_v8(cudnnHandle_t handle,
454
+ cudnnRNNDescriptor_t rnnDesc,
455
+ const int32_t devSeqLengths[],
456
+ cudnnRNNDataDescriptor_t yDesc,
457
+ const void *y,
458
+ const void *dy,
459
+ cudnnRNNDataDescriptor_t xDesc,
460
+ void *dx,
461
+ cudnnTensorDescriptor_t hDesc,
462
+ const void *hx,
463
+ const void *dhy,
464
+ void *dhx,
465
+ cudnnTensorDescriptor_t cDesc,
466
+ const void *cx,
467
+ const void *dcy,
468
+ void *dcx,
469
+ size_t weightSpaceSize,
470
+ const void *weightSpace,
471
+ size_t workSpaceSize,
472
+ void *workSpace,
473
+ size_t reserveSpaceSize,
474
+ void *reserveSpace);
475
+
476
+ cudnnStatus_t CUDNNWINAPI
477
+ cudnnRNNBackwardWeights_v8(cudnnHandle_t handle,
478
+ cudnnRNNDescriptor_t rnnDesc,
479
+ cudnnWgradMode_t addGrad,
480
+ const int32_t devSeqLengths[],
481
+ cudnnRNNDataDescriptor_t xDesc,
482
+ const void *x,
483
+ cudnnTensorDescriptor_t hDesc,
484
+ const void *hx,
485
+ cudnnRNNDataDescriptor_t yDesc,
486
+ const void *y,
487
+ size_t weightSpaceSize,
488
+ void *dweightSpace,
489
+ size_t workSpaceSize,
490
+ void *workSpace,
491
+ size_t reserveSpaceSize,
492
+ void *reserveSpace);
493
+
494
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
495
+ cudnnMultiHeadAttnBackwardData(cudnnHandle_t handle,
496
+ const cudnnAttnDescriptor_t attnDesc,
497
+ const int loWinIdx[],
498
+ const int hiWinIdx[],
499
+ const int devSeqLengthsDQDO[],
500
+ const int devSeqLengthsDKDV[],
501
+ const cudnnSeqDataDescriptor_t doDesc,
502
+ const void *dout,
503
+ const cudnnSeqDataDescriptor_t dqDesc,
504
+ void *dqueries,
505
+ const void *queries,
506
+ const cudnnSeqDataDescriptor_t dkDesc,
507
+ void *dkeys,
508
+ const void *keys,
509
+ const cudnnSeqDataDescriptor_t dvDesc,
510
+ void *dvalues,
511
+ const void *values,
512
+ size_t weightSizeInBytes,
513
+ const void *weights,
514
+ size_t workSpaceSizeInBytes,
515
+ void *workSpace,
516
+ size_t reserveSpaceSizeInBytes,
517
+ void *reserveSpace);
518
+
519
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
520
+ cudnnMultiHeadAttnBackwardWeights(cudnnHandle_t handle,
521
+ const cudnnAttnDescriptor_t attnDesc,
522
+ cudnnWgradMode_t addGrad,
523
+ const cudnnSeqDataDescriptor_t qDesc,
524
+ const void *queries,
525
+ const cudnnSeqDataDescriptor_t kDesc,
526
+ const void *keys,
527
+ const cudnnSeqDataDescriptor_t vDesc,
528
+ const void *values,
529
+ const cudnnSeqDataDescriptor_t doDesc,
530
+ const void *dout,
531
+ size_t weightSizeInBytes,
532
+ const void *weights,
533
+ void *dweights,
534
+ size_t workSpaceSizeInBytes,
535
+ void *workSpace,
536
+ size_t reserveSpaceSizeInBytes,
537
+ void *reserveSpace);
538
+
539
+ /*
540
+ * CTC (Connectionist Temporal Classification) loss descriptor create/destory/set/get functions
541
+ */
542
+ /* Input normalization mode for loss function */
543
+ typedef enum {
544
+ CUDNN_LOSS_NORMALIZATION_NONE = 0,
545
+ CUDNN_LOSS_NORMALIZATION_SOFTMAX = 1,
546
+ } cudnnLossNormalizationMode_t;
547
+
548
+ cudnnStatus_t CUDNNWINAPI
549
+ cudnnCreateCTCLossDescriptor(cudnnCTCLossDescriptor_t *ctcLossDesc);
550
+
551
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
552
+ cudnnSetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t compType);
553
+
554
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
555
+ cudnnSetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
556
+ cudnnDataType_t compType,
557
+ cudnnLossNormalizationMode_t normMode,
558
+ cudnnNanPropagation_t gradMode);
559
+
560
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
561
+ cudnnSetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
562
+ cudnnDataType_t compType,
563
+ cudnnLossNormalizationMode_t normMode,
564
+ cudnnNanPropagation_t gradMode,
565
+ int maxLabelLength);
566
+
567
+ cudnnStatus_t CUDNNWINAPI
568
+ cudnnSetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
569
+ cudnnDataType_t compType,
570
+ cudnnLossNormalizationMode_t normMode,
571
+ cudnnCTCGradMode_t ctcGradMode,
572
+ int maxLabelLength);
573
+
574
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
575
+ cudnnGetCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc, cudnnDataType_t *compType);
576
+
577
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
578
+ cudnnGetCTCLossDescriptorEx(cudnnCTCLossDescriptor_t ctcLossDesc,
579
+ cudnnDataType_t *compType,
580
+ cudnnLossNormalizationMode_t *normMode,
581
+ cudnnNanPropagation_t *gradMode);
582
+
583
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
584
+ cudnnGetCTCLossDescriptor_v8(cudnnCTCLossDescriptor_t ctcLossDesc,
585
+ cudnnDataType_t *compType,
586
+ cudnnLossNormalizationMode_t *normMode,
587
+ cudnnNanPropagation_t *gradMode,
588
+ int *maxLabelLength);
589
+
590
+ cudnnStatus_t CUDNNWINAPI
591
+ cudnnGetCTCLossDescriptor_v9(cudnnCTCLossDescriptor_t ctcLossDesc,
592
+ cudnnDataType_t *compType,
593
+ cudnnLossNormalizationMode_t *normMode,
594
+ cudnnCTCGradMode_t *ctcGradMode,
595
+ int *maxLabelLength);
596
+
597
+ cudnnStatus_t CUDNNWINAPI
598
+ cudnnDestroyCTCLossDescriptor(cudnnCTCLossDescriptor_t ctcLossDesc);
599
+
600
+ /* return the ctc costs and gradients, given the probabilities and labels */
601
+ cudnnStatus_t CUDNNWINAPI
602
+ cudnnCTCLoss(
603
+ cudnnHandle_t handle,
604
+ const cudnnTensorDescriptor_t
605
+ probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
606
+ mini batch size, A is the alphabet size) */
607
+ const void *probs, /* probabilities after softmax, in GPU memory */
608
+ const int hostLabels[], /* labels, in CPU memory */
609
+ const int hostLabelLengths[], /* the length of each label, in CPU memory */
610
+ const int hostInputLengths[], /* the lengths of timing steps in each batch, in CPU memory */
611
+ void *costs, /* the returned costs of CTC, in GPU memory */
612
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
613
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
614
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
615
+ cudnnCTCLossDescriptor_t ctcLossDesc,
616
+ void *workspace, /* pointer to the workspace, in GPU memory */
617
+ size_t workSpaceSizeInBytes); /* size of the workspace */
618
+
619
+ /* return the ctc costs and gradients, given the probabilities and labels */
620
+ cudnnStatus_t CUDNNWINAPI
621
+ cudnnCTCLoss_v8(
622
+ cudnnHandle_t handle,
623
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
624
+ cudnnCTCLossDescriptor_t ctcLossDesc,
625
+ const cudnnTensorDescriptor_t
626
+ probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the timing steps, N is the
627
+ mini batch size, A is the alphabet size) */
628
+ const void *probs, /* probabilities after softmax, in GPU memory */
629
+ const int labels[], /* labels, in GPU memory */
630
+ const int labelLengths[], /* the length of each label, in GPU memory */
631
+ const int inputLengths[], /* the lengths of timing steps in each batch, in GPU memory */
632
+ void *costs, /* the returned costs of CTC, in GPU memory */
633
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the dimensions are T,N,A */
634
+ void *gradients, /* the returned CTC gradients, in GPU memory, to compute costs only, set it to NULL */
635
+ size_t workSpaceSizeInBytes, /* size of the workspace */
636
+ void *workspace); /* pointer to the workspace, in GPU memory */
637
+
638
+ /* return the workspace size needed for ctc */
639
+ cudnnStatus_t CUDNNWINAPI
640
+ cudnnGetCTCLossWorkspaceSize(
641
+ cudnnHandle_t handle,
642
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
643
+ timing steps, N is the mini batch size, A is the alphabet size) */
644
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
645
+ dimensions are T,N,A. To compute costs
646
+ only, set it to NULL */
647
+ const int *labels, /* labels, in CPU memory */
648
+ const int *labelLengths, /* the length of each label, in CPU memory */
649
+ const int *inputLengths, /* the lengths of timing steps in each batch, in CPU memory */
650
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
651
+ cudnnCTCLossDescriptor_t ctcLossDesc,
652
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
653
+
654
+ /* return the workspace size needed for ctc */
655
+ cudnnStatus_t CUDNNWINAPI
656
+ cudnnGetCTCLossWorkspaceSize_v8(
657
+ cudnnHandle_t handle,
658
+ cudnnCTCLossAlgo_t algo, /* algorithm selected, supported now 0 and 1 */
659
+ cudnnCTCLossDescriptor_t ctcLossDesc,
660
+ const cudnnTensorDescriptor_t probsDesc, /* Tensor descriptor for probabilities, the dimensions are T,N,A (T is the
661
+ timing steps, N is the mini batch size, A is the alphabet size) */
662
+ const cudnnTensorDescriptor_t gradientsDesc, /* Tensor descriptor for gradients, the
663
+ dimensions are T,N,A. To compute costs
664
+ only, set it to NULL */
665
+ size_t *sizeInBytes); /* pointer to the returned workspace size */
666
+
667
+ #if defined(__cplusplus)
668
+ }
669
+ #endif
670
+
671
+ #endif /* CUDNN_ADV_H_ */
.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_backend_v9.h ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #ifndef _CUDNN_BACKEND_H_
51
+ #define _CUDNN_BACKEND_H_
52
+
53
+ /*
54
+ * The content of this header has been moved into cudnn_graph.h.
55
+ * This header is kept for the backward compatibility purpose.
56
+ */
57
+
58
+ #include "cudnn_graph.h"
59
+
60
+ #endif /* _CUDNN_BACKEND_H_ */
.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_graph.h ADDED
@@ -0,0 +1,909 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_graph : cuDNN's basic definitions operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_GRAPH_H_)
55
+ #define CUDNN_GRAPH_H_
56
+
57
+ #include <cuda_runtime_api.h>
58
+ #include <library_types.h>
59
+
60
+ #include <stdint.h>
61
+
62
+ #include "cudnn_version.h"
63
+
64
+ /* These version numbers are autogenerated, do not edit manually. */
65
+ #define CUDNN_GRAPH_MAJOR 9
66
+ #define CUDNN_GRAPH_MINOR 1
67
+ #define CUDNN_GRAPH_PATCH 0
68
+
69
+ #if (CUDNN_GRAPH_MAJOR != CUDNN_MAJOR) || (CUDNN_GRAPH_MINOR != CUDNN_MINOR) || (CUDNN_GRAPH_PATCH != CUDNN_PATCHLEVEL)
70
+ #error Version mismatch in cuDNN GRAPH!!!
71
+ #endif
72
+
73
+ #ifndef CUDNNWINAPI
74
+ #ifdef _WIN32
75
+ #define CUDNNWINAPI __stdcall
76
+ #else
77
+ #define CUDNNWINAPI
78
+ #endif
79
+ #endif
80
+
81
+ /* Warnings for deprecated API-s are enabled using the CUDNN_WARN_DEPRECATED macro */
82
+ #if defined(CUDNN_WARN_DEPRECATED) && (defined(__GNUC__) || defined(__clang__))
83
+ /* GCC, Intel C/C++, Cray C/C++, CLANG, IBM XL C/C++ little endian */
84
+ #define CUDNN_DEPRECATED __attribute__((deprecated))
85
+ #define CUDNN_DEPRECATED_ENUM __attribute__((deprecated))
86
+ #elif defined(CUDNN_WARN_DEPRECATED) && defined(_MSC_VER)
87
+ /* Microsoft Visual C++ */
88
+ #define CUDNN_DEPRECATED __declspec(deprecated)
89
+ #define CUDNN_DEPRECATED_ENUM __declspec(deprecated)
90
+ #elif defined(CUDNN_WARN_DEPRECATED) && (__cplusplus >= 201402L)
91
+ /* C++14 compilers */
92
+ #define CUDNN_DEPRECATED [[deprecated]]
93
+ #define CUDNN_DEPRECATED_ENUM [[deprecated]]
94
+ #else
95
+ /* No support for the deprecated attribute */
96
+ #define CUDNN_DEPRECATED
97
+ #define CUDNN_DEPRECATED_ENUM
98
+ #endif
99
+
100
+ #if defined(__cplusplus)
101
+ extern "C" {
102
+ #endif
103
+
104
+ struct cudnnContext;
105
+ typedef struct cudnnContext *cudnnHandle_t;
106
+
107
+ size_t CUDNNWINAPI
108
+ cudnnGetVersion(void);
109
+
110
+ size_t CUDNNWINAPI
111
+ cudnnGetMaxDeviceVersion(void);
112
+
113
+ /* Returns CUDA Runtime version statically linked against cudnn */
114
+ size_t CUDNNWINAPI
115
+ cudnnGetCudartVersion(void);
116
+
117
+ /*
118
+ * CUDNN return codes
119
+ */
120
+ typedef enum {
121
+ CUDNN_STATUS_SUCCESS = 0,
122
+
123
+ /* Uncategorized errors */
124
+ CUDNN_STATUS_NOT_INITIALIZED = 1001,
125
+ CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH = 1002,
126
+ CUDNN_STATUS_SERIALIZATION_VERSION_MISMATCH = 1003,
127
+ CUDNN_STATUS_DEPRECATED = 1004,
128
+ CUDNN_STATUS_LICENSE_ERROR = 1005,
129
+ CUDNN_STATUS_RUNTIME_IN_PROGRESS = 1006,
130
+ CUDNN_STATUS_RUNTIME_FP_OVERFLOW = 1007,
131
+
132
+ CUDNN_STATUS_BAD_PARAM = 2000,
133
+ CUDNN_STATUS_BAD_PARAM_NULL_POINTER = 2002,
134
+ CUDNN_STATUS_BAD_PARAM_MISALIGNED_POINTER = 2003,
135
+ CUDNN_STATUS_BAD_PARAM_NOT_FINALIZED = 2004,
136
+ CUDNN_STATUS_BAD_PARAM_OUT_OF_BOUND = 2005,
137
+ CUDNN_STATUS_BAD_PARAM_SIZE_INSUFFICIENT = 2006,
138
+ CUDNN_STATUS_BAD_PARAM_STREAM_MISMATCH = 2007,
139
+ CUDNN_STATUS_BAD_PARAM_SHAPE_MISMATCH = 2008,
140
+ CUDNN_STATUS_BAD_PARAM_DUPLICATED_ENTRIES = 2009,
141
+ CUDNN_STATUS_BAD_PARAM_ATTRIBUTE_TYPE = 2010,
142
+
143
+ CUDNN_STATUS_NOT_SUPPORTED = 3000,
144
+ CUDNN_STATUS_NOT_SUPPORTED_GRAPH_PATTERN = 3001,
145
+ CUDNN_STATUS_NOT_SUPPORTED_SHAPE = 3002,
146
+ CUDNN_STATUS_NOT_SUPPORTED_DATA_TYPE = 3003,
147
+ CUDNN_STATUS_NOT_SUPPORTED_LAYOUT = 3004,
148
+ CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDA_DRIVER = 3005,
149
+ CUDNN_STATUS_NOT_SUPPORTED_INCOMPATIBLE_CUDART = 3006,
150
+ CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH = 3007,
151
+ CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING = 3008,
152
+ CUDNN_STATUS_NOT_SUPPORTED_SUBLIBRARY_UNAVAILABLE = 3009,
153
+ CUDNN_STATUS_NOT_SUPPORTED_SHARED_MEMORY_INSUFFICIENT = 3010,
154
+ CUDNN_STATUS_NOT_SUPPORTED_PADDING = 3011,
155
+ CUDNN_STATUS_NOT_SUPPORTED_BAD_LAUNCH_PARAM = 3012,
156
+
157
+ CUDNN_STATUS_INTERNAL_ERROR = 4000,
158
+ CUDNN_STATUS_INTERNAL_ERROR_COMPILATION_FAILED = 4001,
159
+ CUDNN_STATUS_INTERNAL_ERROR_UNEXPECTED_VALUE = 4002,
160
+ CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED = 4003,
161
+ CUDNN_STATUS_INTERNAL_ERROR_DEVICE_ALLOCATION_FAILED = 4004,
162
+ CUDNN_STATUS_INTERNAL_ERROR_BAD_LAUNCH_PARAM = 4005,
163
+ CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED = 4006,
164
+
165
+ CUDNN_STATUS_EXECUTION_FAILED = 5000,
166
+ CUDNN_STATUS_EXECUTION_FAILED_CUDA_DRIVER = 5001,
167
+ CUDNN_STATUS_EXECUTION_FAILED_CUBLAS = 5002,
168
+ CUDNN_STATUS_EXECUTION_FAILED_CUDART = 5003,
169
+ CUDNN_STATUS_EXECUTION_FAILED_CURAND = 5004,
170
+
171
+ CUDNN_STATUS_ALLOC_FAILED CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_HOST_ALLOCATION_FAILED,
172
+ CUDNN_STATUS_INVALID_VALUE CUDNN_DEPRECATED_ENUM = 2001 /* please transition to CUDNN_STATUS_BAD_PARAM instead */,
173
+ CUDNN_STATUS_ARCH_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_NOT_SUPPORTED_ARCH_MISMATCH,
174
+ CUDNN_STATUS_MAPPING_ERROR CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_INTERNAL_ERROR_TEXTURE_CREATION_FAILED,
175
+ CUDNN_STATUS_RUNTIME_PREREQUISITE_MISSING CUDNN_DEPRECATED_ENUM =
176
+ CUDNN_STATUS_NOT_SUPPORTED_RUNTIME_PREREQUISITE_MISSING,
177
+ CUDNN_STATUS_VERSION_MISMATCH CUDNN_DEPRECATED_ENUM = CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH,
178
+ } cudnnStatus_t;
179
+
180
+ #define CUDNN_STATUS_FULL_ERROR_CODE(category, specific_err) ((cudnnStatus_t)(0 + (category) + (specific_err)))
181
+ #define CUDNN_STATUS_CATEGORY(full_error_code) ((full_error_code) / 1000 * 1000)
182
+ #define CUDNN_STATUS_SPECIFIC_ERROR(full_error_code) ((full_error_code) % 1000)
183
+
184
+ /* human-readable error messages */
185
+ const char *CUDNNWINAPI
186
+ cudnnGetErrorString(cudnnStatus_t status);
187
+
188
+ void CUDNNWINAPI
189
+ cudnnGetLastErrorString(char *message, size_t max_size);
190
+
191
+ /* Forward definition in this version only */
192
+ typedef struct cudnnRuntimeTag_t cudnnRuntimeTag_t CUDNN_DEPRECATED;
193
+
194
+ typedef enum {
195
+ CUDNN_ERRQUERY_RAWCODE = 0,
196
+ CUDNN_ERRQUERY_NONBLOCKING = 1,
197
+ CUDNN_ERRQUERY_BLOCKING = 2,
198
+ } cudnnErrQueryMode_t;
199
+
200
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
201
+ cudnnQueryRuntimeError(cudnnHandle_t handle, cudnnStatus_t *rstatus, cudnnErrQueryMode_t mode, cudnnRuntimeTag_t *tag);
202
+
203
+ cudnnStatus_t CUDNNWINAPI
204
+ cudnnGetProperty(libraryPropertyType type, int *value);
205
+
206
+ cudnnStatus_t CUDNNWINAPI
207
+ cudnnCreate(cudnnHandle_t *handle);
208
+ cudnnStatus_t CUDNNWINAPI
209
+ cudnnDestroy(cudnnHandle_t handle);
210
+ cudnnStatus_t CUDNNWINAPI
211
+ cudnnSetStream(cudnnHandle_t handle, cudaStream_t streamId);
212
+ cudnnStatus_t CUDNNWINAPI
213
+ cudnnGetStream(cudnnHandle_t handle, cudaStream_t *streamId);
214
+ /*
215
+ * CUDNN data type
216
+ */
217
+ typedef enum {
218
+ CUDNN_DATA_FLOAT = 0,
219
+ CUDNN_DATA_DOUBLE = 1,
220
+ CUDNN_DATA_HALF = 2,
221
+ CUDNN_DATA_INT8 = 3,
222
+ CUDNN_DATA_INT32 = 4,
223
+ CUDNN_DATA_INT8x4 CUDNN_DEPRECATED_ENUM = 5,
224
+ CUDNN_DATA_UINT8 = 6,
225
+ CUDNN_DATA_UINT8x4 CUDNN_DEPRECATED_ENUM = 7,
226
+ CUDNN_DATA_INT8x32 CUDNN_DEPRECATED_ENUM = 8,
227
+ CUDNN_DATA_BFLOAT16 = 9,
228
+ CUDNN_DATA_INT64 = 10,
229
+ CUDNN_DATA_BOOLEAN = 11,
230
+ CUDNN_DATA_FP8_E4M3 = 12,
231
+ CUDNN_DATA_FP8_E5M2 = 13,
232
+ CUDNN_DATA_FAST_FLOAT_FOR_FP8 = 14,
233
+ } cudnnDataType_t;
234
+
235
+ /*
236
+ * CUDNN math type
237
+ */
238
+ typedef enum {
239
+ CUDNN_DEFAULT_MATH = 0,
240
+ CUDNN_TENSOR_OP_MATH = 1,
241
+ CUDNN_TENSOR_OP_MATH_ALLOW_CONVERSION = 2,
242
+ CUDNN_FMA_MATH = 3,
243
+ } cudnnMathType_t;
244
+
245
+ /*
246
+ * CUDNN propagate Nan
247
+ */
248
+ typedef enum {
249
+ CUDNN_NOT_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 0,
250
+ CUDNN_PROPAGATE_NAN CUDNN_DEPRECATED_ENUM = 1,
251
+ } cudnnNanPropagation_t;
252
+
253
+ /*
254
+ * Behavior for OOB samples. OOB samples are samples where L+R > T is encountered during the gradient calculation. If
255
+ * gradMode is set to CUDNN_CTC_SKIP_OOB_GRADIENTS, then the CTC loss function does not write to the gradient buffer for
256
+ * that sample. Instead, the current values, even not finite, are retained. If gradMode is set to
257
+ * CUDNN_CTC_ZERO_OOB_GRADIENTS, then the gradient for that sample is set to zero. This guarantees a finite gradient.
258
+ */
259
+ typedef enum {
260
+ CUDNN_CTC_ZERO_OOB_GRADIENTS = 0,
261
+ CUDNN_CTC_SKIP_OOB_GRADIENTS = 1,
262
+ } cudnnCTCGradMode_t;
263
+
264
+ typedef enum {
265
+ CUDNN_TENSOR_NCHW = 0, /* row major (wStride = 1, hStride = w) */
266
+ CUDNN_TENSOR_NHWC = 1, /* feature maps interleaved ( cStride = 1 )*/
267
+ CUDNN_TENSOR_NCHW_VECT_C = 2, /* each image point is vector of element of C, vector length in data type */
268
+ } cudnnTensorFormat_t;
269
+
270
+ /*
271
+ * CUDNN ReduceTensor op type
272
+ */
273
+ typedef enum {
274
+ CUDNN_REDUCE_TENSOR_ADD = 0,
275
+ CUDNN_REDUCE_TENSOR_MUL = 1,
276
+ CUDNN_REDUCE_TENSOR_MIN = 2,
277
+ CUDNN_REDUCE_TENSOR_MAX = 3,
278
+ CUDNN_REDUCE_TENSOR_AMAX = 4,
279
+ CUDNN_REDUCE_TENSOR_AVG = 5,
280
+ CUDNN_REDUCE_TENSOR_NORM1 = 6,
281
+ CUDNN_REDUCE_TENSOR_NORM2 = 7,
282
+ CUDNN_REDUCE_TENSOR_MUL_NO_ZEROS = 8,
283
+ } cudnnReduceTensorOp_t;
284
+
285
+ /*
286
+ * activation mode
287
+ */
288
+ typedef enum {
289
+ CUDNN_ACTIVATION_SIGMOID = 0,
290
+ CUDNN_ACTIVATION_RELU = 1,
291
+ CUDNN_ACTIVATION_TANH = 2,
292
+ CUDNN_ACTIVATION_CLIPPED_RELU = 3,
293
+ CUDNN_ACTIVATION_ELU = 4,
294
+ CUDNN_ACTIVATION_IDENTITY = 5,
295
+ CUDNN_ACTIVATION_SWISH = 6
296
+ } cudnnActivationMode_t CUDNN_DEPRECATED;
297
+
298
+ typedef enum {
299
+ CUDNN_SEV_FATAL = 0,
300
+ CUDNN_SEV_ERROR = 1,
301
+ CUDNN_SEV_WARNING = 2,
302
+ CUDNN_SEV_INFO = 3,
303
+ } cudnnSeverity_t;
304
+
305
+ /* Message masks to be used with cudnnSetCallback() */
306
+ #define CUDNN_SEV_ERROR_EN (1U << CUDNN_SEV_ERROR)
307
+ #define CUDNN_SEV_WARNING_EN (1U << CUDNN_SEV_WARNING)
308
+ #define CUDNN_SEV_INFO_EN (1U << CUDNN_SEV_INFO)
309
+
310
+ /* struct containing useful informaiton for each API call */
311
+ typedef struct cudnnDebugStruct {
312
+ unsigned cudnn_version;
313
+ cudnnStatus_t cudnnStatus;
314
+ unsigned time_sec; /* epoch time in seconds */
315
+ unsigned time_usec; /* microseconds part of epoch time */
316
+ unsigned time_delta; /* time since start in seconds */
317
+ cudnnHandle_t handle; /* cudnn handle */
318
+ cudaStream_t stream; /* cuda stream ID */
319
+ unsigned long long pid; /* process ID */
320
+ unsigned long long tid; /* thread ID */
321
+ int cudaDeviceId; /* CUDA device ID */
322
+ int reserved[15]; /* reserved for future use */
323
+ } cudnnDebug_t;
324
+
325
+ typedef void (*cudnnCallback_t)(cudnnSeverity_t sev, void *udata, const cudnnDebug_t *dbg, const char *msg);
326
+
327
+ cudnnStatus_t CUDNNWINAPI
328
+ cudnnSetCallback(unsigned mask, void *udata, cudnnCallback_t fptr);
329
+
330
+ cudnnStatus_t CUDNNWINAPI
331
+ cudnnGetCallback(unsigned *mask, void **udata, cudnnCallback_t *fptr);
332
+
333
+ /*
334
+ * \brief Cross-library version checker.
335
+ * This function is implemented differently in each sub-library. Each sublib
336
+ * checks whether its own version matches that of its dependencies.
337
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
338
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
339
+ */
340
+ cudnnStatus_t CUDNNWINAPI
341
+ cudnnGraphVersionCheck(void);
342
+
343
+ /* Maximum supported number of tensor dimensions */
344
+ #define CUDNN_DIM_MAX 8
345
+
346
+ /*
347
+ * convolution mode
348
+ */
349
+ typedef enum { CUDNN_CONVOLUTION = 0, CUDNN_CROSS_CORRELATION = 1 } cudnnConvolutionMode_t;
350
+
351
+ /*
352
+ * CUDNN Reorder
353
+ */
354
+ typedef enum {
355
+ CUDNN_DEFAULT_REORDER = 0,
356
+ CUDNN_NO_REORDER = 1,
357
+ } cudnnReorderType_t CUDNN_DEPRECATED;
358
+
359
+ typedef void *cudnnBackendDescriptor_t;
360
+
361
+ typedef struct cudnnFractionStruct {
362
+ int64_t numerator;
363
+ int64_t denominator;
364
+ } cudnnFraction_t;
365
+
366
+ typedef enum {
367
+ CUDNN_POINTWISE_ADD = 0,
368
+ CUDNN_POINTWISE_ADD_SQUARE = 5,
369
+ CUDNN_POINTWISE_DIV = 6,
370
+ CUDNN_POINTWISE_MAX = 3,
371
+ CUDNN_POINTWISE_MIN = 2,
372
+ CUDNN_POINTWISE_MOD = 7,
373
+ CUDNN_POINTWISE_MUL = 1,
374
+ CUDNN_POINTWISE_POW = 8,
375
+ CUDNN_POINTWISE_SUB = 9,
376
+
377
+ CUDNN_POINTWISE_ABS = 10,
378
+ CUDNN_POINTWISE_CEIL = 11,
379
+ CUDNN_POINTWISE_COS = 12,
380
+ CUDNN_POINTWISE_EXP = 13,
381
+ CUDNN_POINTWISE_FLOOR = 14,
382
+ CUDNN_POINTWISE_LOG = 15,
383
+ CUDNN_POINTWISE_NEG = 16,
384
+ CUDNN_POINTWISE_RSQRT = 17,
385
+ CUDNN_POINTWISE_SIN = 18,
386
+ CUDNN_POINTWISE_SQRT = 4,
387
+ CUDNN_POINTWISE_TAN = 19,
388
+ CUDNN_POINTWISE_ERF = 20,
389
+ CUDNN_POINTWISE_IDENTITY = 21,
390
+ CUDNN_POINTWISE_RECIPROCAL = 22,
391
+ CUDNN_POINTWISE_ATAN2 = 23,
392
+
393
+ CUDNN_POINTWISE_RELU_FWD = 100,
394
+ CUDNN_POINTWISE_TANH_FWD = 101,
395
+ CUDNN_POINTWISE_SIGMOID_FWD = 102,
396
+ CUDNN_POINTWISE_ELU_FWD = 103,
397
+ CUDNN_POINTWISE_GELU_FWD = 104,
398
+ CUDNN_POINTWISE_SOFTPLUS_FWD = 105,
399
+ CUDNN_POINTWISE_SWISH_FWD = 106,
400
+ CUDNN_POINTWISE_GELU_APPROX_TANH_FWD = 107,
401
+
402
+ CUDNN_POINTWISE_RELU_BWD = 200,
403
+ CUDNN_POINTWISE_TANH_BWD = 201,
404
+ CUDNN_POINTWISE_SIGMOID_BWD = 202,
405
+ CUDNN_POINTWISE_ELU_BWD = 203,
406
+ CUDNN_POINTWISE_GELU_BWD = 204,
407
+ CUDNN_POINTWISE_SOFTPLUS_BWD = 205,
408
+ CUDNN_POINTWISE_SWISH_BWD = 206,
409
+ CUDNN_POINTWISE_GELU_APPROX_TANH_BWD = 207,
410
+
411
+ CUDNN_POINTWISE_CMP_EQ = 300,
412
+ CUDNN_POINTWISE_CMP_NEQ = 301,
413
+ CUDNN_POINTWISE_CMP_GT = 302,
414
+ CUDNN_POINTWISE_CMP_GE = 303,
415
+ CUDNN_POINTWISE_CMP_LT = 304,
416
+ CUDNN_POINTWISE_CMP_LE = 305,
417
+
418
+ CUDNN_POINTWISE_LOGICAL_AND = 400,
419
+ CUDNN_POINTWISE_LOGICAL_OR = 401,
420
+ CUDNN_POINTWISE_LOGICAL_NOT = 402,
421
+
422
+ CUDNN_POINTWISE_GEN_INDEX = 501,
423
+
424
+ CUDNN_POINTWISE_BINARY_SELECT = 601,
425
+ } cudnnPointwiseMode_t;
426
+
427
+ typedef enum {
428
+ CUDNN_RESAMPLE_NEAREST = 0,
429
+ CUDNN_RESAMPLE_BILINEAR = 1,
430
+ CUDNN_RESAMPLE_AVGPOOL = 2,
431
+ CUDNN_RESAMPLE_AVGPOOL_INCLUDE_PADDING = 2,
432
+ CUDNN_RESAMPLE_AVGPOOL_EXCLUDE_PADDING = 4,
433
+ CUDNN_RESAMPLE_MAXPOOL = 3,
434
+ } cudnnResampleMode_t;
435
+
436
+ typedef enum {
437
+ CUDNN_SIGNAL_SET = 0,
438
+ CUDNN_SIGNAL_WAIT = 1,
439
+ } cudnnSignalMode_t;
440
+
441
+ typedef enum {
442
+ CUDNN_GENSTATS_SUM_SQSUM = 0,
443
+ } cudnnGenStatsMode_t;
444
+
445
+ typedef enum {
446
+ CUDNN_BN_FINALIZE_STATISTICS_TRAINING = 0,
447
+ CUDNN_BN_FINALIZE_STATISTICS_INFERENCE = 1,
448
+ } cudnnBnFinalizeStatsMode_t;
449
+
450
+ typedef enum {
451
+ CUDNN_RNG_DISTRIBUTION_BERNOULLI,
452
+ CUDNN_RNG_DISTRIBUTION_UNIFORM,
453
+ CUDNN_RNG_DISTRIBUTION_NORMAL,
454
+ } cudnnRngDistribution_t;
455
+
456
+ typedef enum {
457
+ CUDNN_ATTR_POINTWISE_MODE = 0,
458
+ CUDNN_ATTR_POINTWISE_MATH_PREC = 1,
459
+ CUDNN_ATTR_POINTWISE_NAN_PROPAGATION CUDNN_DEPRECATED_ENUM = 2,
460
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP = 3,
461
+ CUDNN_ATTR_POINTWISE_RELU_UPPER_CLIP = 4,
462
+ CUDNN_ATTR_POINTWISE_RELU_LOWER_CLIP_SLOPE = 5,
463
+ CUDNN_ATTR_POINTWISE_ELU_ALPHA = 6,
464
+ CUDNN_ATTR_POINTWISE_SOFTPLUS_BETA = 7,
465
+ CUDNN_ATTR_POINTWISE_SWISH_BETA = 8,
466
+ CUDNN_ATTR_POINTWISE_AXIS = 9,
467
+
468
+ CUDNN_ATTR_CONVOLUTION_COMP_TYPE = 100,
469
+ CUDNN_ATTR_CONVOLUTION_CONV_MODE = 101,
470
+ CUDNN_ATTR_CONVOLUTION_DILATIONS = 102,
471
+ CUDNN_ATTR_CONVOLUTION_FILTER_STRIDES = 103,
472
+ CUDNN_ATTR_CONVOLUTION_POST_PADDINGS = 104,
473
+ CUDNN_ATTR_CONVOLUTION_PRE_PADDINGS = 105,
474
+ CUDNN_ATTR_CONVOLUTION_SPATIAL_DIMS = 106,
475
+
476
+ CUDNN_ATTR_ENGINEHEUR_MODE = 200,
477
+ CUDNN_ATTR_ENGINEHEUR_OPERATION_GRAPH = 201,
478
+ CUDNN_ATTR_ENGINEHEUR_RESULTS = 202,
479
+ CUDNN_ATTR_ENGINEHEUR_SM_COUNT_TARGET = 203,
480
+
481
+ CUDNN_ATTR_ENGINECFG_ENGINE = 300,
482
+ CUDNN_ATTR_ENGINECFG_INTERMEDIATE_INFO = 301,
483
+ CUDNN_ATTR_ENGINECFG_KNOB_CHOICES = 302,
484
+
485
+ CUDNN_ATTR_EXECUTION_PLAN_HANDLE = 400,
486
+ CUDNN_ATTR_EXECUTION_PLAN_ENGINE_CONFIG = 401,
487
+ CUDNN_ATTR_EXECUTION_PLAN_WORKSPACE_SIZE = 402,
488
+ CUDNN_ATTR_EXECUTION_PLAN_COMPUTED_INTERMEDIATE_UIDS = 403,
489
+ CUDNN_ATTR_EXECUTION_PLAN_RUN_ONLY_INTERMEDIATE_UIDS = 404,
490
+ CUDNN_ATTR_EXECUTION_PLAN_JSON_REPRESENTATION = 405,
491
+
492
+ CUDNN_ATTR_INTERMEDIATE_INFO_UNIQUE_ID = 500,
493
+ CUDNN_ATTR_INTERMEDIATE_INFO_SIZE = 501,
494
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_DATA_UIDS = 502,
495
+ CUDNN_ATTR_INTERMEDIATE_INFO_DEPENDENT_ATTRIBUTES = 503,
496
+
497
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_TYPE = 600,
498
+ CUDNN_ATTR_KNOB_CHOICE_KNOB_VALUE = 601,
499
+
500
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_ALPHA = 700,
501
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_BETA = 701,
502
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_CONV_DESC = 702,
503
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_W = 703,
504
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_X = 704,
505
+ CUDNN_ATTR_OPERATION_CONVOLUTION_FORWARD_Y = 705,
506
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_ALPHA = 706,
507
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_BETA = 707,
508
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_CONV_DESC = 708,
509
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_W = 709,
510
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DX = 710,
511
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_DATA_DY = 711,
512
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_ALPHA = 712,
513
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_BETA = 713,
514
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_CONV_DESC = 714,
515
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DW = 715,
516
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_X = 716,
517
+ CUDNN_ATTR_OPERATION_CONVOLUTION_BWD_FILTER_DY = 717,
518
+
519
+ CUDNN_ATTR_OPERATION_POINTWISE_PW_DESCRIPTOR = 750,
520
+ CUDNN_ATTR_OPERATION_POINTWISE_XDESC = 751,
521
+ CUDNN_ATTR_OPERATION_POINTWISE_BDESC = 752,
522
+ CUDNN_ATTR_OPERATION_POINTWISE_YDESC = 753,
523
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA1 = 754,
524
+ CUDNN_ATTR_OPERATION_POINTWISE_ALPHA2 = 755,
525
+ CUDNN_ATTR_OPERATION_POINTWISE_DXDESC = 756,
526
+ CUDNN_ATTR_OPERATION_POINTWISE_DYDESC = 757,
527
+ CUDNN_ATTR_OPERATION_POINTWISE_TDESC = 758,
528
+
529
+ CUDNN_ATTR_OPERATION_GENSTATS_MODE = 770,
530
+ CUDNN_ATTR_OPERATION_GENSTATS_MATH_PREC = 771,
531
+ CUDNN_ATTR_OPERATION_GENSTATS_XDESC = 772,
532
+ CUDNN_ATTR_OPERATION_GENSTATS_SUMDESC = 773,
533
+ CUDNN_ATTR_OPERATION_GENSTATS_SQSUMDESC = 774,
534
+
535
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_STATS_MODE = 780,
536
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_MATH_PREC = 781,
537
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SUM_DESC = 782,
538
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_Y_SQ_SUM_DESC = 783,
539
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SCALE_DESC = 784,
540
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_BIAS_DESC = 785,
541
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_MEAN_DESC = 786,
542
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_PREV_RUNNING_VAR_DESC = 787,
543
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_MEAN_DESC = 788,
544
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_UPDATED_RUNNING_VAR_DESC = 789,
545
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_MEAN_DESC = 790,
546
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_SAVED_INV_STD_DESC = 791,
547
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_SCALE_DESC = 792,
548
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EQ_BIAS_DESC = 793,
549
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_ACCUM_COUNT_DESC = 794,
550
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EPSILON_DESC = 795,
551
+ CUDNN_ATTR_OPERATION_BN_FINALIZE_EXP_AVERATE_FACTOR_DESC = 796,
552
+
553
+ CUDNN_ATTR_OPERATIONGRAPH_HANDLE = 800,
554
+ CUDNN_ATTR_OPERATIONGRAPH_OPS = 801,
555
+ CUDNN_ATTR_OPERATIONGRAPH_ENGINE_GLOBAL_COUNT = 802,
556
+
557
+ CUDNN_ATTR_TENSOR_BYTE_ALIGNMENT = 900,
558
+ CUDNN_ATTR_TENSOR_DATA_TYPE = 901,
559
+ CUDNN_ATTR_TENSOR_DIMENSIONS = 902,
560
+ CUDNN_ATTR_TENSOR_STRIDES = 903,
561
+ CUDNN_ATTR_TENSOR_VECTOR_COUNT = 904,
562
+ CUDNN_ATTR_TENSOR_VECTORIZED_DIMENSION = 905,
563
+ CUDNN_ATTR_TENSOR_UNIQUE_ID = 906,
564
+ CUDNN_ATTR_TENSOR_IS_VIRTUAL = 907,
565
+ CUDNN_ATTR_TENSOR_IS_BY_VALUE = 908,
566
+ CUDNN_ATTR_TENSOR_REORDERING_MODE = 909,
567
+ CUDNN_ATTR_TENSOR_RAGGED_OFFSET_DESC = 913,
568
+
569
+ CUDNN_ATTR_VARIANT_PACK_UNIQUE_IDS = 1000,
570
+ CUDNN_ATTR_VARIANT_PACK_DATA_POINTERS = 1001,
571
+ CUDNN_ATTR_VARIANT_PACK_INTERMEDIATES = 1002,
572
+ CUDNN_ATTR_VARIANT_PACK_WORKSPACE = 1003,
573
+
574
+ CUDNN_ATTR_LAYOUT_INFO_TENSOR_UID = 1100,
575
+ CUDNN_ATTR_LAYOUT_INFO_TYPES = 1101,
576
+
577
+ CUDNN_ATTR_KNOB_INFO_TYPE = 1200,
578
+ CUDNN_ATTR_KNOB_INFO_MAXIMUM_VALUE = 1201,
579
+ CUDNN_ATTR_KNOB_INFO_MINIMUM_VALUE = 1202,
580
+ CUDNN_ATTR_KNOB_INFO_STRIDE = 1203,
581
+
582
+ CUDNN_ATTR_ENGINE_OPERATION_GRAPH = 1300,
583
+ CUDNN_ATTR_ENGINE_GLOBAL_INDEX = 1301,
584
+ CUDNN_ATTR_ENGINE_KNOB_INFO = 1302,
585
+ CUDNN_ATTR_ENGINE_NUMERICAL_NOTE = 1303,
586
+ CUDNN_ATTR_ENGINE_LAYOUT_INFO = 1304,
587
+ CUDNN_ATTR_ENGINE_BEHAVIOR_NOTE = 1305,
588
+ CUDNN_ATTR_ENGINE_SM_COUNT_TARGET = 1306,
589
+
590
+ CUDNN_ATTR_MATMUL_COMP_TYPE = 1500,
591
+ CUDNN_ATTR_MATMUL_PADDING_VALUE = 1503,
592
+
593
+ CUDNN_ATTR_OPERATION_MATMUL_ADESC = 1520,
594
+ CUDNN_ATTR_OPERATION_MATMUL_BDESC = 1521,
595
+ CUDNN_ATTR_OPERATION_MATMUL_CDESC = 1522,
596
+ CUDNN_ATTR_OPERATION_MATMUL_DESC = 1523,
597
+ CUDNN_ATTR_OPERATION_MATMUL_IRREGULARLY_STRIDED_BATCH_COUNT CUDNN_DEPRECATED_ENUM = 1524,
598
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_M_OVERRIDE_DESC = 1525,
599
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_N_OVERRIDE_DESC = 1526,
600
+ CUDNN_ATTR_OPERATION_MATMUL_GEMM_K_OVERRIDE_DESC = 1527,
601
+
602
+ CUDNN_ATTR_REDUCTION_OPERATOR = 1600,
603
+ CUDNN_ATTR_REDUCTION_COMP_TYPE = 1601,
604
+
605
+ CUDNN_ATTR_OPERATION_REDUCTION_XDESC = 1610,
606
+ CUDNN_ATTR_OPERATION_REDUCTION_YDESC = 1611,
607
+ CUDNN_ATTR_OPERATION_REDUCTION_DESC = 1612,
608
+
609
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MATH_PREC = 1620,
610
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_MEAN_DESC = 1621,
611
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_INVSTD_DESC = 1622,
612
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_BN_SCALE_DESC = 1623,
613
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_X_DESC = 1624,
614
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DY_DESC = 1625,
615
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_SCALE_DESC = 1626,
616
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_DBN_BIAS_DESC = 1627,
617
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_DY_SCALE_DESC = 1628,
618
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_X_SCALE_DESC = 1629,
619
+ CUDNN_ATTR_OPERATION_BN_BWD_WEIGHTS_EQ_BIAS = 1630,
620
+
621
+ CUDNN_ATTR_RESAMPLE_MODE = 1700,
622
+ CUDNN_ATTR_RESAMPLE_COMP_TYPE = 1701,
623
+ CUDNN_ATTR_RESAMPLE_SPATIAL_DIMS = 1702,
624
+ CUDNN_ATTR_RESAMPLE_POST_PADDINGS = 1703,
625
+ CUDNN_ATTR_RESAMPLE_PRE_PADDINGS = 1704,
626
+ CUDNN_ATTR_RESAMPLE_STRIDES = 1705,
627
+ CUDNN_ATTR_RESAMPLE_WINDOW_DIMS = 1706,
628
+ CUDNN_ATTR_RESAMPLE_NAN_PROPAGATION = 1707,
629
+ CUDNN_ATTR_RESAMPLE_PADDING_MODE = 1708,
630
+
631
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_XDESC = 1710,
632
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_YDESC = 1711,
633
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_IDXDESC = 1712,
634
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_ALPHA CUDNN_DEPRECATED_ENUM = 1713,
635
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_BETA CUDNN_DEPRECATED_ENUM = 1714,
636
+ CUDNN_ATTR_OPERATION_RESAMPLE_FWD_DESC = 1716,
637
+
638
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DXDESC = 1720,
639
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DYDESC = 1721,
640
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_IDXDESC = 1722,
641
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_ALPHA CUDNN_DEPRECATED_ENUM = 1723,
642
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_BETA CUDNN_DEPRECATED_ENUM = 1724,
643
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_DESC = 1725,
644
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_XDESC = 1726,
645
+ CUDNN_ATTR_OPERATION_RESAMPLE_BWD_YDESC = 1727,
646
+
647
+ CUDNN_ATTR_OPERATION_CONCAT_AXIS = 1800,
648
+ CUDNN_ATTR_OPERATION_CONCAT_INPUT_DESCS = 1801,
649
+ CUDNN_ATTR_OPERATION_CONCAT_INPLACE_INDEX = 1802,
650
+ CUDNN_ATTR_OPERATION_CONCAT_OUTPUT_DESC = 1803,
651
+
652
+ CUDNN_ATTR_OPERATION_SIGNAL_MODE = 1900,
653
+ CUDNN_ATTR_OPERATION_SIGNAL_FLAGDESC = 1901,
654
+ CUDNN_ATTR_OPERATION_SIGNAL_VALUE = 1902,
655
+ CUDNN_ATTR_OPERATION_SIGNAL_XDESC = 1903,
656
+ CUDNN_ATTR_OPERATION_SIGNAL_YDESC = 1904,
657
+
658
+ CUDNN_ATTR_OPERATION_NORM_FWD_MODE = 2000,
659
+ CUDNN_ATTR_OPERATION_NORM_FWD_PHASE = 2001,
660
+ CUDNN_ATTR_OPERATION_NORM_FWD_XDESC = 2002,
661
+ CUDNN_ATTR_OPERATION_NORM_FWD_MEAN_DESC = 2003,
662
+ CUDNN_ATTR_OPERATION_NORM_FWD_INV_VARIANCE_DESC = 2004,
663
+ CUDNN_ATTR_OPERATION_NORM_FWD_SCALE_DESC = 2005,
664
+ CUDNN_ATTR_OPERATION_NORM_FWD_BIAS_DESC = 2006,
665
+ CUDNN_ATTR_OPERATION_NORM_FWD_EPSILON_DESC = 2007,
666
+ CUDNN_ATTR_OPERATION_NORM_FWD_EXP_AVG_FACTOR_DESC = 2008,
667
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_MEAN_DESC = 2009,
668
+ CUDNN_ATTR_OPERATION_NORM_FWD_INPUT_RUNNING_VAR_DESC = 2010,
669
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_MEAN_DESC = 2011,
670
+ CUDNN_ATTR_OPERATION_NORM_FWD_OUTPUT_RUNNING_VAR_DESC = 2012,
671
+ CUDNN_ATTR_OPERATION_NORM_FWD_YDESC = 2013,
672
+ CUDNN_ATTR_OPERATION_NORM_FWD_PEER_STAT_DESCS = 2014,
673
+
674
+ CUDNN_ATTR_OPERATION_NORM_BWD_MODE = 2100,
675
+ CUDNN_ATTR_OPERATION_NORM_BWD_XDESC = 2101,
676
+ CUDNN_ATTR_OPERATION_NORM_BWD_MEAN_DESC = 2102,
677
+ CUDNN_ATTR_OPERATION_NORM_BWD_INV_VARIANCE_DESC = 2103,
678
+ CUDNN_ATTR_OPERATION_NORM_BWD_DYDESC = 2104,
679
+ CUDNN_ATTR_OPERATION_NORM_BWD_SCALE_DESC = 2105,
680
+ CUDNN_ATTR_OPERATION_NORM_BWD_EPSILON_DESC = 2106,
681
+ CUDNN_ATTR_OPERATION_NORM_BWD_DSCALE_DESC = 2107,
682
+ CUDNN_ATTR_OPERATION_NORM_BWD_DBIAS_DESC = 2108,
683
+ CUDNN_ATTR_OPERATION_NORM_BWD_DXDESC = 2109,
684
+ CUDNN_ATTR_OPERATION_NORM_BWD_PEER_STAT_DESCS = 2110,
685
+
686
+ CUDNN_ATTR_OPERATION_RESHAPE_XDESC = 2200,
687
+ CUDNN_ATTR_OPERATION_RESHAPE_YDESC = 2201,
688
+
689
+ CUDNN_ATTR_RNG_DISTRIBUTION = 2300,
690
+ CUDNN_ATTR_RNG_NORMAL_DIST_MEAN = 2301,
691
+ CUDNN_ATTR_RNG_NORMAL_DIST_STANDARD_DEVIATION = 2302,
692
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MAXIMUM = 2303,
693
+ CUDNN_ATTR_RNG_UNIFORM_DIST_MINIMUM = 2304,
694
+ CUDNN_ATTR_RNG_BERNOULLI_DIST_PROBABILITY = 2305,
695
+
696
+ CUDNN_ATTR_OPERATION_RNG_YDESC = 2310,
697
+ CUDNN_ATTR_OPERATION_RNG_SEED = 2311,
698
+ CUDNN_ATTR_OPERATION_RNG_DESC = 2312,
699
+ CUDNN_ATTR_OPERATION_RNG_OFFSET_DESC = 2313,
700
+ } cudnnBackendAttributeName_t;
701
+
702
+ typedef enum {
703
+ CUDNN_TYPE_HANDLE = 0,
704
+ CUDNN_TYPE_DATA_TYPE,
705
+ CUDNN_TYPE_BOOLEAN,
706
+ CUDNN_TYPE_INT64,
707
+ CUDNN_TYPE_FLOAT,
708
+ CUDNN_TYPE_DOUBLE,
709
+ CUDNN_TYPE_VOID_PTR,
710
+ CUDNN_TYPE_CONVOLUTION_MODE,
711
+ CUDNN_TYPE_HEUR_MODE,
712
+ CUDNN_TYPE_KNOB_TYPE,
713
+ CUDNN_TYPE_NAN_PROPOGATION CUDNN_DEPRECATED_ENUM,
714
+ CUDNN_TYPE_NUMERICAL_NOTE,
715
+ CUDNN_TYPE_LAYOUT_TYPE,
716
+ CUDNN_TYPE_ATTRIB_NAME,
717
+ CUDNN_TYPE_POINTWISE_MODE,
718
+ CUDNN_TYPE_BACKEND_DESCRIPTOR,
719
+ CUDNN_TYPE_GENSTATS_MODE,
720
+ CUDNN_TYPE_BN_FINALIZE_STATS_MODE,
721
+ CUDNN_TYPE_REDUCTION_OPERATOR_TYPE,
722
+ CUDNN_TYPE_BEHAVIOR_NOTE,
723
+ CUDNN_TYPE_TENSOR_REORDERING_MODE,
724
+ CUDNN_TYPE_RESAMPLE_MODE,
725
+ CUDNN_TYPE_PADDING_MODE,
726
+ CUDNN_TYPE_INT32,
727
+ CUDNN_TYPE_CHAR,
728
+ CUDNN_TYPE_SIGNAL_MODE,
729
+ CUDNN_TYPE_FRACTION,
730
+ CUDNN_TYPE_NORM_MODE,
731
+ CUDNN_TYPE_NORM_FWD_PHASE,
732
+ CUDNN_TYPE_RNG_DISTRIBUTION
733
+ } cudnnBackendAttributeType_t;
734
+
735
+ typedef enum {
736
+ CUDNN_BACKEND_POINTWISE_DESCRIPTOR = 0,
737
+ CUDNN_BACKEND_CONVOLUTION_DESCRIPTOR,
738
+ CUDNN_BACKEND_ENGINE_DESCRIPTOR,
739
+ CUDNN_BACKEND_ENGINECFG_DESCRIPTOR,
740
+ CUDNN_BACKEND_ENGINEHEUR_DESCRIPTOR,
741
+ CUDNN_BACKEND_EXECUTION_PLAN_DESCRIPTOR,
742
+ CUDNN_BACKEND_INTERMEDIATE_INFO_DESCRIPTOR,
743
+ CUDNN_BACKEND_KNOB_CHOICE_DESCRIPTOR,
744
+ CUDNN_BACKEND_KNOB_INFO_DESCRIPTOR,
745
+ CUDNN_BACKEND_LAYOUT_INFO_DESCRIPTOR,
746
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR,
747
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR,
748
+ CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR,
749
+ CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR,
750
+ CUDNN_BACKEND_OPERATION_GEN_STATS_DESCRIPTOR,
751
+ CUDNN_BACKEND_OPERATIONGRAPH_DESCRIPTOR,
752
+ CUDNN_BACKEND_VARIANT_PACK_DESCRIPTOR,
753
+ CUDNN_BACKEND_TENSOR_DESCRIPTOR,
754
+ CUDNN_BACKEND_MATMUL_DESCRIPTOR,
755
+ CUDNN_BACKEND_OPERATION_MATMUL_DESCRIPTOR,
756
+ CUDNN_BACKEND_OPERATION_BN_FINALIZE_STATISTICS_DESCRIPTOR,
757
+ CUDNN_BACKEND_REDUCTION_DESCRIPTOR,
758
+ CUDNN_BACKEND_OPERATION_REDUCTION_DESCRIPTOR,
759
+ CUDNN_BACKEND_OPERATION_BN_BWD_WEIGHTS_DESCRIPTOR,
760
+ CUDNN_BACKEND_RESAMPLE_DESCRIPTOR,
761
+ CUDNN_BACKEND_OPERATION_RESAMPLE_FWD_DESCRIPTOR,
762
+ CUDNN_BACKEND_OPERATION_RESAMPLE_BWD_DESCRIPTOR,
763
+ CUDNN_BACKEND_OPERATION_CONCAT_DESCRIPTOR,
764
+ CUDNN_BACKEND_OPERATION_SIGNAL_DESCRIPTOR,
765
+ CUDNN_BACKEND_OPERATION_NORM_FORWARD_DESCRIPTOR,
766
+ CUDNN_BACKEND_OPERATION_NORM_BACKWARD_DESCRIPTOR,
767
+ CUDNN_BACKEND_OPERATION_RESHAPE_DESCRIPTOR,
768
+ CUDNN_BACKEND_RNG_DESCRIPTOR,
769
+ CUDNN_BACKEND_OPERATION_RNG_DESCRIPTOR,
770
+ } cudnnBackendDescriptorType_t;
771
+
772
+ typedef enum {
773
+ CUDNN_NUMERICAL_NOTE_TENSOR_CORE = 0,
774
+ CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS,
775
+ CUDNN_NUMERICAL_NOTE_REDUCED_PRECISION_REDUCTION,
776
+ CUDNN_NUMERICAL_NOTE_FFT,
777
+ CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC,
778
+ CUDNN_NUMERICAL_NOTE_WINOGRAD,
779
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_4x4,
780
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_6x6,
781
+ CUDNN_NUMERICAL_NOTE_WINOGRAD_TILE_13x13,
782
+ CUDNN_NUMERICAL_NOTE_STRICT_NAN_PROP,
783
+ CUDNN_NUMERICAL_NOTE_TYPE_COUNT,
784
+ } cudnnBackendNumericalNote_t;
785
+
786
+ typedef enum {
787
+ CUDNN_BEHAVIOR_NOTE_RUNTIME_COMPILATION = 0,
788
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_FILTER_INT8x32_REORDER = 1,
789
+ CUDNN_BEHAVIOR_NOTE_REQUIRES_BIAS_INT8x32_REORDER = 2,
790
+ CUDNN_BEHAVIOR_NOTE_TYPE_COUNT,
791
+ } cudnnBackendBehaviorNote_t;
792
+
793
+ typedef enum {
794
+ CUDNN_KNOB_TYPE_SPLIT_K CUDNN_DEPRECATED_ENUM = 0,
795
+ CUDNN_KNOB_TYPE_SWIZZLE = 1,
796
+ CUDNN_KNOB_TYPE_TILE_SIZE = 2,
797
+ CUDNN_KNOB_TYPE_USE_TEX CUDNN_DEPRECATED_ENUM = 3,
798
+ CUDNN_KNOB_TYPE_EDGE = 4,
799
+ CUDNN_KNOB_TYPE_KBLOCK CUDNN_DEPRECATED_ENUM = 5,
800
+ CUDNN_KNOB_TYPE_LDGA CUDNN_DEPRECATED_ENUM = 6,
801
+ CUDNN_KNOB_TYPE_LDGB CUDNN_DEPRECATED_ENUM = 7,
802
+ CUDNN_KNOB_TYPE_CHUNK_K CUDNN_DEPRECATED_ENUM = 8,
803
+ CUDNN_KNOB_TYPE_SPLIT_H CUDNN_DEPRECATED_ENUM = 9,
804
+ CUDNN_KNOB_TYPE_WINO_TILE CUDNN_DEPRECATED_ENUM = 10,
805
+ CUDNN_KNOB_TYPE_MULTIPLY = 11,
806
+ CUDNN_KNOB_TYPE_SPLIT_K_BUF = 12,
807
+ CUDNN_KNOB_TYPE_TILEK = 13,
808
+ CUDNN_KNOB_TYPE_STAGES = 14,
809
+ CUDNN_KNOB_TYPE_REDUCTION_MODE = 15,
810
+ CUDNN_KNOB_TYPE_CTA_SPLIT_K_MODE CUDNN_DEPRECATED_ENUM = 16,
811
+ CUDNN_KNOB_TYPE_SPLIT_K_SLC = 17,
812
+ CUDNN_KNOB_TYPE_IDX_MODE CUDNN_DEPRECATED_ENUM = 18,
813
+ CUDNN_KNOB_TYPE_SLICED CUDNN_DEPRECATED_ENUM = 19,
814
+ CUDNN_KNOB_TYPE_SPLIT_RS CUDNN_DEPRECATED_ENUM = 20,
815
+ CUDNN_KNOB_TYPE_SINGLEBUFFER CUDNN_DEPRECATED_ENUM = 21,
816
+ CUDNN_KNOB_TYPE_LDGC CUDNN_DEPRECATED_ENUM = 22,
817
+ CUDNN_KNOB_TYPE_SPECFILT = 23,
818
+ CUDNN_KNOB_TYPE_KERNEL_CFG = 24,
819
+ CUDNN_KNOB_TYPE_WORKSPACE = 25,
820
+ CUDNN_KNOB_TYPE_TILE_CGA CUDNN_DEPRECATED_ENUM = 26,
821
+ CUDNN_KNOB_TYPE_TILE_CGA_M = 27,
822
+ CUDNN_KNOB_TYPE_TILE_CGA_N = 28,
823
+ CUDNN_KNOB_TYPE_BLOCK_SIZE = 29,
824
+ CUDNN_KNOB_TYPE_OCCUPANCY = 30,
825
+ CUDNN_KNOB_TYPE_ARRAY_SIZE_PER_THREAD = 31,
826
+ CUDNN_KNOB_TYPE_NUM_C_PER_BLOCK CUDNN_DEPRECATED_ENUM = 32,
827
+ CUDNN_KNOB_TYPE_SPLIT_COLS = 33,
828
+ CUDNN_KNOB_TYPE_TILE_ROWS = 34,
829
+ CUDNN_KNOB_TYPE_TILE_COLS = 35,
830
+ CUDNN_KNOB_TYPE_LOAD_SIZE = 36,
831
+ CUDNN_KNOB_TYPE_COUNTS,
832
+ } cudnnBackendKnobType_t;
833
+
834
+ typedef enum {
835
+ CUDNN_LAYOUT_TYPE_PREFERRED_NCHW = 0,
836
+ CUDNN_LAYOUT_TYPE_PREFERRED_NHWC = 1,
837
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD4CK = 2,
838
+ CUDNN_LAYOUT_TYPE_PREFERRED_PAD8CK = 3,
839
+ CUDNN_LAYOUT_TYPE_COUNT = 4,
840
+ } cudnnBackendLayoutType_t;
841
+
842
+ typedef enum {
843
+ CUDNN_HEUR_MODE_INSTANT = 0,
844
+ CUDNN_HEUR_MODE_B = 1,
845
+ CUDNN_HEUR_MODE_FALLBACK = 2,
846
+ CUDNN_HEUR_MODE_A = 3,
847
+ CUDNN_HEUR_MODES_COUNT = 4,
848
+ } cudnnBackendHeurMode_t;
849
+
850
+ typedef enum {
851
+ CUDNN_TENSOR_REORDERING_NONE = 0,
852
+ CUDNN_TENSOR_REORDERING_INT8x32 = 1,
853
+ CUDNN_TENSOR_REORDERING_F16x16 = 2,
854
+ } cudnnBackendTensorReordering_t;
855
+
856
+ typedef enum {
857
+ CUDNN_ZERO_PAD = 0,
858
+ CUDNN_NEG_INF_PAD = 1,
859
+ CUDNN_EDGE_VAL_PAD = 2,
860
+ } cudnnPaddingMode_t;
861
+
862
+ typedef enum {
863
+ CUDNN_LAYER_NORM = 0,
864
+ CUDNN_INSTANCE_NORM = 1,
865
+ CUDNN_BATCH_NORM = 2,
866
+ CUDNN_GROUP_NORM = 3,
867
+ CUDNN_RMS_NORM = 4,
868
+ } cudnnBackendNormMode_t;
869
+
870
+ typedef enum {
871
+ CUDNN_NORM_FWD_INFERENCE = 0,
872
+ CUDNN_NORM_FWD_TRAINING = 1,
873
+ } cudnnBackendNormFwdPhase_t;
874
+
875
+ cudnnStatus_t CUDNNWINAPI
876
+ cudnnBackendCreateDescriptor(cudnnBackendDescriptorType_t descriptorType, cudnnBackendDescriptor_t *descriptor);
877
+
878
+ cudnnStatus_t CUDNNWINAPI
879
+ cudnnBackendDestroyDescriptor(cudnnBackendDescriptor_t descriptor);
880
+
881
+ cudnnStatus_t CUDNNWINAPI
882
+ cudnnBackendInitialize(cudnnBackendDescriptor_t descriptor);
883
+
884
+ cudnnStatus_t CUDNNWINAPI
885
+ cudnnBackendFinalize(cudnnBackendDescriptor_t descriptor);
886
+
887
+ cudnnStatus_t CUDNNWINAPI
888
+ cudnnBackendSetAttribute(cudnnBackendDescriptor_t descriptor,
889
+ cudnnBackendAttributeName_t attributeName,
890
+ cudnnBackendAttributeType_t attributeType,
891
+ int64_t elementCount,
892
+ const void *arrayOfElements);
893
+
894
+ cudnnStatus_t CUDNNWINAPI
895
+ cudnnBackendGetAttribute(cudnnBackendDescriptor_t const descriptor,
896
+ cudnnBackendAttributeName_t attributeName,
897
+ cudnnBackendAttributeType_t attributeType,
898
+ int64_t requestedElementCount,
899
+ int64_t *elementCount,
900
+ void *arrayOfElements);
901
+
902
+ cudnnStatus_t CUDNNWINAPI
903
+ cudnnBackendExecute(cudnnHandle_t handle, cudnnBackendDescriptor_t executionPlan, cudnnBackendDescriptor_t variantPack);
904
+
905
+ #if defined(__cplusplus)
906
+ }
907
+ #endif
908
+
909
+ #endif /* CUDNN_GRAPH_H_ */
.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_ops.h ADDED
@@ -0,0 +1,1316 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /*
51
+ * cudnn_ops : cuDNN's basic definitions and basic operations.
52
+ */
53
+
54
+ #if !defined(CUDNN_OPS_H_)
55
+ #define CUDNN_OPS_H_
56
+
57
+ #include <stdint.h>
58
+
59
+ #include "cudnn_version.h"
60
+ #include "cudnn_graph.h"
61
+
62
+ /* These version numbers are autogenerated, do not edit manually. */
63
+ #define CUDNN_OPS_MAJOR 9
64
+ #define CUDNN_OPS_MINOR 1
65
+ #define CUDNN_OPS_PATCH 0
66
+
67
+ #if (CUDNN_OPS_MAJOR != CUDNN_MAJOR) || (CUDNN_OPS_MINOR != CUDNN_MINOR) || (CUDNN_OPS_PATCH != CUDNN_PATCHLEVEL)
68
+ #error Version mismatch in cuDNN OPS INFER!!!
69
+ #endif
70
+
71
+ #if defined(__cplusplus)
72
+ extern "C" {
73
+ #endif
74
+
75
+ /* Data structures to represent Image/Filter and the Neural Network Layer */
76
+ typedef struct cudnnTensorStruct *cudnnTensorDescriptor_t;
77
+ typedef struct cudnnPoolingStruct *cudnnPoolingDescriptor_t CUDNN_DEPRECATED;
78
+ typedef struct cudnnFilterStruct *cudnnFilterDescriptor_t CUDNN_DEPRECATED;
79
+ typedef struct cudnnLRNStruct *cudnnLRNDescriptor_t;
80
+ typedef struct cudnnActivationStruct *cudnnActivationDescriptor_t CUDNN_DEPRECATED;
81
+ typedef struct cudnnSpatialTransformerStruct *cudnnSpatialTransformerDescriptor_t;
82
+ typedef struct cudnnOpTensorStruct *cudnnOpTensorDescriptor_t CUDNN_DEPRECATED;
83
+ typedef struct cudnnReduceTensorStruct *cudnnReduceTensorDescriptor_t CUDNN_DEPRECATED;
84
+ typedef struct cudnnCTCLossStruct *cudnnCTCLossDescriptor_t;
85
+ typedef struct cudnnTensorTransformStruct *cudnnTensorTransformDescriptor_t CUDNN_DEPRECATED;
86
+ /*
87
+ * CUDNN Determinism
88
+ */
89
+ typedef enum {
90
+ CUDNN_NON_DETERMINISTIC = 0,
91
+ CUDNN_DETERMINISTIC = 1,
92
+ } cudnnDeterminism_t;
93
+
94
+ /* Create an instance of a generic Tensor descriptor */
95
+ cudnnStatus_t CUDNNWINAPI
96
+ cudnnCreateTensorDescriptor(cudnnTensorDescriptor_t *tensorDesc);
97
+
98
+ cudnnStatus_t CUDNNWINAPI
99
+ cudnnSetTensor4dDescriptor(cudnnTensorDescriptor_t tensorDesc,
100
+ cudnnTensorFormat_t format,
101
+ cudnnDataType_t dataType, /* image data type */
102
+ int n, /* number of inputs (batch size) */
103
+ int c, /* number of input feature maps */
104
+ int h, /* height of input section */
105
+ int w); /* width of input section */
106
+
107
+ cudnnStatus_t CUDNNWINAPI
108
+ cudnnSetTensor4dDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
109
+ cudnnDataType_t dataType, /* image data type */
110
+ int n, /* number of inputs (batch size) */
111
+ int c, /* number of input feature maps */
112
+ int h, /* height of input section */
113
+ int w, /* width of input section */
114
+ int nStride,
115
+ int cStride,
116
+ int hStride,
117
+ int wStride);
118
+
119
+ cudnnStatus_t CUDNNWINAPI
120
+ cudnnGetTensor4dDescriptor(const cudnnTensorDescriptor_t tensorDesc,
121
+ cudnnDataType_t *dataType, /* image data type */
122
+ int *n, /* number of inputs (batch size) */
123
+ int *c, /* number of input feature maps */
124
+ int *h, /* height of input section */
125
+ int *w, /* width of input section */
126
+ int *nStride,
127
+ int *cStride,
128
+ int *hStride,
129
+ int *wStride);
130
+
131
+ cudnnStatus_t CUDNNWINAPI
132
+ cudnnSetTensorNdDescriptor(cudnnTensorDescriptor_t tensorDesc,
133
+ cudnnDataType_t dataType,
134
+ int nbDims,
135
+ const int dimA[],
136
+ const int strideA[]);
137
+
138
+ cudnnStatus_t CUDNNWINAPI
139
+ cudnnSetTensorNdDescriptorEx(cudnnTensorDescriptor_t tensorDesc,
140
+ cudnnTensorFormat_t format,
141
+ cudnnDataType_t dataType,
142
+ int nbDims,
143
+ const int dimA[]);
144
+
145
+ cudnnStatus_t CUDNNWINAPI
146
+ cudnnGetTensorNdDescriptor(const cudnnTensorDescriptor_t tensorDesc,
147
+ int nbDimsRequested,
148
+ cudnnDataType_t *dataType,
149
+ int *nbDims,
150
+ int dimA[],
151
+ int strideA[]);
152
+
153
+ cudnnStatus_t CUDNNWINAPI
154
+ cudnnGetTensorSizeInBytes(const cudnnTensorDescriptor_t tensorDesc, size_t *size);
155
+
156
+ /* PixelOffset( n, c, h, w ) = n *input_stride + c * feature_stride + h * h_stride + w * w_stride
157
+
158
+ 1)Example of all images in row major order one batch of features after the other (with an optional padding on row)
159
+ input_stride : c x h x h_stride
160
+ feature_stride : h x h_stride
161
+ h_stride : >= w ( h_stride = w if no padding)
162
+ w_stride : 1
163
+
164
+
165
+ 2)Example of all images in row major with features maps interleaved
166
+ input_stride : c x h x h_stride
167
+ feature_stride : 1
168
+ h_stride : w x c
169
+ w_stride : c
170
+
171
+ 3)Example of all images in column major order one batch of features after the other (with optional padding on column)
172
+ input_stride : c x w x w_stride
173
+ feature_stride : w x w_stride
174
+ h_stride : 1
175
+ w_stride : >= h
176
+
177
+ */
178
+
179
+ /* Destroy an instance of Tensor4d descriptor */
180
+ cudnnStatus_t CUDNNWINAPI
181
+ cudnnDestroyTensorDescriptor(cudnnTensorDescriptor_t tensorDesc);
182
+
183
+ /* Fold/unfold transforms */
184
+ typedef enum {
185
+ CUDNN_TRANSFORM_FOLD = 0U,
186
+ CUDNN_TRANSFORM_UNFOLD = 1U,
187
+ } cudnnFoldingDirection_t;
188
+
189
+ /** Create a destination descriptor for cudnnTransformTensor */
190
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
191
+ cudnnInitTransformDest(const cudnnTensorTransformDescriptor_t transformDesc,
192
+ const cudnnTensorDescriptor_t srcDesc,
193
+ cudnnTensorDescriptor_t destDesc,
194
+ size_t *destSizeInBytes);
195
+
196
+ /** Create an empty tensor transform descriptor */
197
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
198
+ cudnnCreateTensorTransformDescriptor(cudnnTensorTransformDescriptor_t *transformDesc);
199
+
200
+ /** Initialize a previously created tensor transform descriptor. */
201
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
202
+ cudnnSetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
203
+ const uint32_t nbDims,
204
+ const cudnnTensorFormat_t destFormat,
205
+ const int32_t padBeforeA[],
206
+ const int32_t padAfterA[],
207
+ const uint32_t foldA[],
208
+ const cudnnFoldingDirection_t direction);
209
+
210
+ /**
211
+ * Retrieves the values stored in a previously initialized tensor transform
212
+ * descriptor.
213
+ */
214
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
215
+ cudnnGetTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc,
216
+ uint32_t nbDimsRequested,
217
+ cudnnTensorFormat_t *destFormat,
218
+ int32_t padBeforeA[],
219
+ int32_t padAfterA[],
220
+ uint32_t foldA[],
221
+ cudnnFoldingDirection_t *direction);
222
+
223
+ /**
224
+ * Destroys a previously created tensor transform descriptor.
225
+ */
226
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
227
+ cudnnDestroyTensorTransformDescriptor(cudnnTensorTransformDescriptor_t transformDesc);
228
+
229
+ /* Tensor layout conversion helper (y = alpha * x + beta * y) */
230
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
231
+ cudnnTransformTensor(cudnnHandle_t handle,
232
+ const void *alpha,
233
+ const cudnnTensorDescriptor_t xDesc,
234
+ const void *x,
235
+ const void *beta,
236
+ const cudnnTensorDescriptor_t yDesc,
237
+ void *y);
238
+
239
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
240
+ cudnnTransformTensorEx(cudnnHandle_t handle,
241
+ const cudnnTensorTransformDescriptor_t transDesc,
242
+ const void *alpha,
243
+ const cudnnTensorDescriptor_t srcDesc,
244
+ const void *srcData,
245
+ const void *beta,
246
+ const cudnnTensorDescriptor_t destDesc,
247
+ void *destData);
248
+
249
+ /* Tensor Bias addition : C = alpha * A + beta * C */
250
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
251
+ cudnnAddTensor(cudnnHandle_t handle,
252
+ const void *alpha,
253
+ const cudnnTensorDescriptor_t aDesc,
254
+ const void *A,
255
+ const void *beta,
256
+ const cudnnTensorDescriptor_t cDesc,
257
+ void *C);
258
+
259
+ /*
260
+ * CUDNN OpTensor op type
261
+ */
262
+ typedef enum {
263
+ CUDNN_OP_TENSOR_ADD = 0,
264
+ CUDNN_OP_TENSOR_MUL = 1,
265
+ CUDNN_OP_TENSOR_MIN = 2,
266
+ CUDNN_OP_TENSOR_MAX = 3,
267
+ CUDNN_OP_TENSOR_SQRT = 4,
268
+ CUDNN_OP_TENSOR_NOT = 5,
269
+ } cudnnOpTensorOp_t;
270
+
271
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
272
+ cudnnCreateOpTensorDescriptor(cudnnOpTensorDescriptor_t *opTensorDesc);
273
+
274
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
275
+ cudnnSetOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc,
276
+ cudnnOpTensorOp_t opTensorOp,
277
+ cudnnDataType_t opTensorCompType,
278
+ cudnnNanPropagation_t opTensorNanOpt);
279
+
280
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
281
+ cudnnGetOpTensorDescriptor(const cudnnOpTensorDescriptor_t opTensorDesc,
282
+ cudnnOpTensorOp_t *opTensorOp,
283
+ cudnnDataType_t *opTensorCompType,
284
+ cudnnNanPropagation_t *opTensorNanOpt);
285
+
286
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
287
+ cudnnDestroyOpTensorDescriptor(cudnnOpTensorDescriptor_t opTensorDesc);
288
+
289
+ /* Tensor operation : C = op( alpha1 * A, alpha2 * B ) + beta * C */
290
+ /* B tensor is ignored for CUDNN_OP_TENSOR_SQRT, CUDNN_OP_TENSOR_NOT. */
291
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
292
+ cudnnOpTensor(cudnnHandle_t handle,
293
+ const cudnnOpTensorDescriptor_t opTensorDesc,
294
+ const void *alpha1,
295
+ const cudnnTensorDescriptor_t aDesc,
296
+ const void *A,
297
+ const void *alpha2,
298
+ const cudnnTensorDescriptor_t bDesc,
299
+ const void *B,
300
+ const void *beta,
301
+ const cudnnTensorDescriptor_t cDesc,
302
+ void *C);
303
+
304
+ /*
305
+ * CUDNN ReduceTensor indices type
306
+ */
307
+ typedef enum {
308
+ CUDNN_REDUCE_TENSOR_NO_INDICES = 0,
309
+ CUDNN_REDUCE_TENSOR_FLATTENED_INDICES = 1,
310
+ } cudnnReduceTensorIndices_t CUDNN_DEPRECATED;
311
+
312
+ /*
313
+ * CUDNN tensor indices type size (all unsigned)
314
+ * Currently not supported, default is 32 bit unsigned.
315
+ */
316
+ typedef enum {
317
+ CUDNN_32BIT_INDICES = 0,
318
+ CUDNN_64BIT_INDICES = 1,
319
+ CUDNN_16BIT_INDICES = 2,
320
+ CUDNN_8BIT_INDICES = 3,
321
+ } cudnnIndicesType_t CUDNN_DEPRECATED;
322
+
323
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
324
+ cudnnCreateReduceTensorDescriptor(cudnnReduceTensorDescriptor_t *reduceTensorDesc);
325
+
326
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
327
+ cudnnSetReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc,
328
+ cudnnReduceTensorOp_t reduceTensorOp,
329
+ cudnnDataType_t reduceTensorCompType,
330
+ cudnnNanPropagation_t reduceTensorNanOpt,
331
+ cudnnReduceTensorIndices_t reduceTensorIndices,
332
+ cudnnIndicesType_t reduceTensorIndicesType);
333
+
334
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
335
+ cudnnGetReduceTensorDescriptor(const cudnnReduceTensorDescriptor_t reduceTensorDesc,
336
+ cudnnReduceTensorOp_t *reduceTensorOp,
337
+ cudnnDataType_t *reduceTensorCompType,
338
+ cudnnNanPropagation_t *reduceTensorNanOpt,
339
+ cudnnReduceTensorIndices_t *reduceTensorIndices,
340
+ cudnnIndicesType_t *reduceTensorIndicesType);
341
+
342
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
343
+ cudnnDestroyReduceTensorDescriptor(cudnnReduceTensorDescriptor_t reduceTensorDesc);
344
+
345
+ /* Helper function to return the minimum size of the index space to be passed to the reduction given the input and
346
+ * output tensors */
347
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
348
+ cudnnGetReductionIndicesSize(cudnnHandle_t handle,
349
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
350
+ const cudnnTensorDescriptor_t aDesc,
351
+ const cudnnTensorDescriptor_t cDesc,
352
+ size_t *sizeInBytes);
353
+
354
+ /* Helper function to return the minimum size of the workspace to be passed to the reduction given the input and output
355
+ * tensors */
356
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
357
+ cudnnGetReductionWorkspaceSize(cudnnHandle_t handle,
358
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
359
+ const cudnnTensorDescriptor_t aDesc,
360
+ const cudnnTensorDescriptor_t cDesc,
361
+ size_t *sizeInBytes);
362
+
363
+ /* Tensor operation : C = reduce op( alpha * A ) + beta * C */
364
+ /* The NaN propagation enum applies to only the min and max reduce ops; the other reduce ops propagate NaN as usual. */
365
+ /* The indices space is ignored for reduce ops other than min or max. */
366
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
367
+ cudnnReduceTensor(cudnnHandle_t handle,
368
+ const cudnnReduceTensorDescriptor_t reduceTensorDesc,
369
+ void *indices,
370
+ size_t indicesSizeInBytes,
371
+ void *workspace,
372
+ size_t workspaceSizeInBytes,
373
+ const void *alpha,
374
+ const cudnnTensorDescriptor_t aDesc,
375
+ const void *A,
376
+ const void *beta,
377
+ const cudnnTensorDescriptor_t cDesc,
378
+ void *C);
379
+
380
+ /* Set all values of a tensor to a given value : y[i] = value[0] */
381
+ cudnnStatus_t CUDNNWINAPI
382
+ cudnnSetTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *valuePtr);
383
+
384
+ /* Scale all values of a tensor by a given factor : y[i] = alpha * y[i] */
385
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
386
+ cudnnScaleTensor(cudnnHandle_t handle, const cudnnTensorDescriptor_t yDesc, void *y, const void *alpha);
387
+
388
+ /* Create an instance of FilterStruct */
389
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
390
+ cudnnCreateFilterDescriptor(cudnnFilterDescriptor_t *filterDesc);
391
+
392
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
393
+ cudnnSetFilter4dDescriptor(cudnnFilterDescriptor_t filterDesc,
394
+ cudnnDataType_t dataType, /* image data type */
395
+ cudnnTensorFormat_t format,
396
+ int k, /* number of output feature maps */
397
+ int c, /* number of input feature maps */
398
+ int h, /* height of each input filter */
399
+ int w); /* width of each input filter */
400
+
401
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
402
+ cudnnGetFilter4dDescriptor(const cudnnFilterDescriptor_t filterDesc,
403
+ cudnnDataType_t *dataType, /* image data type */
404
+ cudnnTensorFormat_t *format,
405
+ int *k, /* number of output feature maps */
406
+ int *c, /* number of input feature maps */
407
+ int *h, /* height of each input filter */
408
+ int *w); /* width of each input filter */
409
+
410
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
411
+ cudnnSetFilterNdDescriptor(cudnnFilterDescriptor_t filterDesc,
412
+ cudnnDataType_t dataType, /* image data type */
413
+ cudnnTensorFormat_t format,
414
+ int nbDims,
415
+ const int filterDimA[]);
416
+
417
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
418
+ cudnnGetFilterNdDescriptor(const cudnnFilterDescriptor_t filterDesc,
419
+ int nbDimsRequested,
420
+ cudnnDataType_t *dataType, /* image data type */
421
+ cudnnTensorFormat_t *format,
422
+ int *nbDims,
423
+ int filterDimA[]);
424
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
425
+ cudnnGetFilterSizeInBytes(const cudnnFilterDescriptor_t filterDesc, size_t *size);
426
+
427
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
428
+ cudnnTransformFilter(cudnnHandle_t handle,
429
+ const cudnnTensorTransformDescriptor_t transDesc,
430
+ const void *alpha,
431
+ const cudnnFilterDescriptor_t srcDesc,
432
+ const void *srcData,
433
+ const void *beta,
434
+ const cudnnFilterDescriptor_t destDesc,
435
+ void *destData);
436
+
437
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
438
+ cudnnDestroyFilterDescriptor(cudnnFilterDescriptor_t filterDesc);
439
+
440
+ /*
441
+ * softmax algorithm
442
+ */
443
+ typedef enum {
444
+ CUDNN_SOFTMAX_FAST = 0, /* straightforward implementation */
445
+ CUDNN_SOFTMAX_ACCURATE = 1, /* subtract max from every point to avoid overflow */
446
+ CUDNN_SOFTMAX_LOG = 2
447
+ } cudnnSoftmaxAlgorithm_t;
448
+
449
+ typedef enum {
450
+ CUDNN_SOFTMAX_MODE_INSTANCE = 0, /* compute the softmax over all C, H, W for each N */
451
+ CUDNN_SOFTMAX_MODE_CHANNEL = 1 /* compute the softmax over all C for each H, W, N */
452
+ } cudnnSoftmaxMode_t;
453
+
454
+ /* Softmax functions: All of the form "output = alpha * Op(inputs) + beta * output" */
455
+
456
+ /* Function to perform forward softmax */
457
+ cudnnStatus_t CUDNNWINAPI
458
+ cudnnSoftmaxForward(cudnnHandle_t handle,
459
+ cudnnSoftmaxAlgorithm_t algo,
460
+ cudnnSoftmaxMode_t mode,
461
+ const void *alpha,
462
+ const cudnnTensorDescriptor_t xDesc,
463
+ const void *x,
464
+ const void *beta,
465
+ const cudnnTensorDescriptor_t yDesc,
466
+ void *y);
467
+
468
+ /*
469
+ * pooling mode
470
+ */
471
+ typedef enum {
472
+ CUDNN_POOLING_MAX = 0,
473
+ CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING = 1, /* count for average includes padded values */
474
+ CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING = 2, /* count for average does not include padded values */
475
+ CUDNN_POOLING_MAX_DETERMINISTIC = 3
476
+ } cudnnPoolingMode_t CUDNN_DEPRECATED;
477
+
478
+ /* Create an instance of pooling descriptor */
479
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
480
+ cudnnCreatePoolingDescriptor(cudnnPoolingDescriptor_t *poolingDesc);
481
+
482
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
483
+ cudnnSetPooling2dDescriptor(cudnnPoolingDescriptor_t poolingDesc,
484
+ cudnnPoolingMode_t mode,
485
+ cudnnNanPropagation_t maxpoolingNanOpt,
486
+ int windowHeight,
487
+ int windowWidth,
488
+ int verticalPadding,
489
+ int horizontalPadding,
490
+ int verticalStride,
491
+ int horizontalStride);
492
+
493
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
494
+ cudnnGetPooling2dDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
495
+ cudnnPoolingMode_t *mode,
496
+ cudnnNanPropagation_t *maxpoolingNanOpt,
497
+ int *windowHeight,
498
+ int *windowWidth,
499
+ int *verticalPadding,
500
+ int *horizontalPadding,
501
+ int *verticalStride,
502
+ int *horizontalStride);
503
+
504
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
505
+ cudnnSetPoolingNdDescriptor(cudnnPoolingDescriptor_t poolingDesc,
506
+ const cudnnPoolingMode_t mode,
507
+ const cudnnNanPropagation_t maxpoolingNanOpt,
508
+ int nbDims,
509
+ const int windowDimA[],
510
+ const int paddingA[],
511
+ const int strideA[]);
512
+
513
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
514
+ cudnnGetPoolingNdDescriptor(const cudnnPoolingDescriptor_t poolingDesc,
515
+ int nbDimsRequested,
516
+ cudnnPoolingMode_t *mode,
517
+ cudnnNanPropagation_t *maxpoolingNanOpt,
518
+ int *nbDims,
519
+ int windowDimA[],
520
+ int paddingA[],
521
+ int strideA[]);
522
+
523
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
524
+ cudnnGetPoolingNdForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
525
+ const cudnnTensorDescriptor_t inputTensorDesc,
526
+ int nbDims,
527
+ int outputTensorDimA[]);
528
+
529
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
530
+ cudnnGetPooling2dForwardOutputDim(const cudnnPoolingDescriptor_t poolingDesc,
531
+ const cudnnTensorDescriptor_t inputTensorDesc,
532
+ int *n,
533
+ int *c,
534
+ int *h,
535
+ int *w);
536
+
537
+ /* Destroy an instance of pooling descriptor */
538
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
539
+ cudnnDestroyPoolingDescriptor(cudnnPoolingDescriptor_t poolingDesc);
540
+
541
+ /* Pooling functions: All of the form "output = alpha * Op(inputs) + beta * output" */
542
+
543
+ /* Function to perform forward pooling */
544
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
545
+ cudnnPoolingForward(cudnnHandle_t handle,
546
+ const cudnnPoolingDescriptor_t poolingDesc,
547
+ const void *alpha,
548
+ const cudnnTensorDescriptor_t xDesc,
549
+ const void *x,
550
+ const void *beta,
551
+ const cudnnTensorDescriptor_t yDesc,
552
+ void *y);
553
+
554
+ /* Activation functions: All of the form "output = alpha * Op(inputs) + beta * output" */
555
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
556
+ cudnnCreateActivationDescriptor(cudnnActivationDescriptor_t *activationDesc);
557
+
558
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
559
+ cudnnSetActivationDescriptor(cudnnActivationDescriptor_t activationDesc,
560
+ cudnnActivationMode_t mode,
561
+ cudnnNanPropagation_t reluNanOpt,
562
+ double coef); /* ceiling for clipped RELU, alpha for ELU */
563
+
564
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
565
+ cudnnGetActivationDescriptor(const cudnnActivationDescriptor_t activationDesc,
566
+ cudnnActivationMode_t *mode,
567
+ cudnnNanPropagation_t *reluNanOpt,
568
+ double *coef); /* ceiling for clipped RELU, alpha for ELU */
569
+
570
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
571
+ cudnnSetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double swish_beta);
572
+
573
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
574
+ cudnnGetActivationDescriptorSwishBeta(cudnnActivationDescriptor_t activationDesc, double *swish_beta);
575
+
576
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
577
+ cudnnDestroyActivationDescriptor(cudnnActivationDescriptor_t activationDesc);
578
+
579
+ /* Function to perform forward activation */
580
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
581
+ cudnnActivationForward(cudnnHandle_t handle,
582
+ cudnnActivationDescriptor_t activationDesc,
583
+ const void *alpha,
584
+ const cudnnTensorDescriptor_t xDesc,
585
+ const void *x,
586
+ const void *beta,
587
+ const cudnnTensorDescriptor_t yDesc,
588
+ void *y);
589
+
590
+ /*
591
+ * Create an instance of LRN (Local Response Normalization) descriptor
592
+ * Uses lrnN=5, lrnAlpha=1e-4, lrnBeta=0.75, lrnK=2.0 as defaults from Krizhevsky'12 ImageNet paper
593
+ */
594
+ cudnnStatus_t CUDNNWINAPI
595
+ cudnnCreateLRNDescriptor(cudnnLRNDescriptor_t *normDesc);
596
+
597
+ #define CUDNN_LRN_MIN_N 1 /* minimum allowed lrnN */
598
+ #define CUDNN_LRN_MAX_N 16 /* maximum allowed lrnN */
599
+ #define CUDNN_LRN_MIN_K 1e-5 /* minimum allowed lrnK */
600
+ #define CUDNN_LRN_MIN_BETA 0.01 /* minimum allowed lrnBeta */
601
+
602
+ /* LRN layer mode */
603
+ typedef enum {
604
+ CUDNN_LRN_CROSS_CHANNEL_DIM1 = 0, /* Normalize across tensor's dimA[1] dimension */
605
+ } cudnnLRNMode_t;
606
+
607
+ /*
608
+ * Uses a window [center-lookBehind, center+lookAhead], where
609
+ * lookBehind = floor( (lrnN-1)/2 ), lookAhead = lrnN-lookBehind-1.
610
+ * Values of double parameters cast to tensor data type.
611
+ */
612
+ cudnnStatus_t CUDNNWINAPI
613
+ cudnnSetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned lrnN, double lrnAlpha, double lrnBeta, double lrnK);
614
+ /*
615
+ * Retrieve the settings currently stored in an LRN layer descriptor
616
+ * Any of the provided pointers can be NULL (no corresponding value will be returned)
617
+ */
618
+ cudnnStatus_t CUDNNWINAPI
619
+ cudnnGetLRNDescriptor(cudnnLRNDescriptor_t normDesc, unsigned *lrnN, double *lrnAlpha, double *lrnBeta, double *lrnK);
620
+
621
+ /* Destroy an instance of LRN descriptor */
622
+ cudnnStatus_t CUDNNWINAPI
623
+ cudnnDestroyLRNDescriptor(cudnnLRNDescriptor_t lrnDesc);
624
+
625
+ /* LRN functions: output = alpha * normalize(x) + beta * old_y */
626
+
627
+ /* LRN cross-channel forward computation. Double parameters cast to tensor data type */
628
+ cudnnStatus_t CUDNNWINAPI
629
+ cudnnLRNCrossChannelForward(cudnnHandle_t handle,
630
+ cudnnLRNDescriptor_t normDesc,
631
+ cudnnLRNMode_t lrnMode,
632
+ const void *alpha,
633
+ const cudnnTensorDescriptor_t xDesc,
634
+ const void *x,
635
+ const void *beta,
636
+ const cudnnTensorDescriptor_t yDesc,
637
+ void *y);
638
+
639
+ typedef enum {
640
+ CUDNN_DIVNORM_PRECOMPUTED_MEANS = 0,
641
+ } cudnnDivNormMode_t;
642
+
643
+ /* LCN/divisive normalization functions: y = alpha * normalize(x) + beta * y */
644
+ cudnnStatus_t CUDNNWINAPI
645
+ cudnnDivisiveNormalizationForward(cudnnHandle_t handle,
646
+ cudnnLRNDescriptor_t normDesc,
647
+ cudnnDivNormMode_t mode,
648
+ const void *alpha,
649
+ const cudnnTensorDescriptor_t xDesc, /* same desc for means, temp, temp2 */
650
+ const void *x,
651
+ const void *means, /* if NULL, means are assumed to be zero */
652
+ void *temp,
653
+ void *temp2,
654
+ const void *beta,
655
+ const cudnnTensorDescriptor_t yDesc,
656
+ void *y);
657
+
658
+ typedef enum {
659
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
660
+ CUDNN_BATCHNORM_PER_ACTIVATION = 0,
661
+
662
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
663
+ CUDNN_BATCHNORM_SPATIAL = 1,
664
+
665
+ /*
666
+ * bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors).
667
+ * May be faster than CUDNN_BATCHNORM_SPATIAL but imposes some limits on the range of values
668
+ */
669
+ CUDNN_BATCHNORM_SPATIAL_PERSISTENT = 2,
670
+ } cudnnBatchNormMode_t CUDNN_DEPRECATED;
671
+
672
+ #define CUDNN_BN_MIN_EPSILON 0.0 /* Minimum epsilon allowed to be used in the Batch Normalization formula */
673
+
674
+ /*
675
+ * Derives a tensor descriptor from layer data descriptor for BatchNormalization
676
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
677
+ * bnScaleBiasMeanVarDesc and bnScaleBiasDiffDesc in Batch Normalization forward and backward functions.
678
+ */
679
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
680
+ cudnnDeriveBNTensorDescriptor(cudnnTensorDescriptor_t derivedBnDesc,
681
+ const cudnnTensorDescriptor_t xDesc,
682
+ cudnnBatchNormMode_t mode);
683
+
684
+ typedef enum {
685
+ CUDNN_BATCHNORM_OPS_BN = 0, /* do batch normalization only */
686
+ CUDNN_BATCHNORM_OPS_BN_ACTIVATION = 1, /* do batchNorm, then activation */
687
+ CUDNN_BATCHNORM_OPS_BN_ADD_ACTIVATION = 2, /* do batchNorm, then elemWiseAdd, then activation */
688
+ } cudnnBatchNormOps_t CUDNN_DEPRECATED;
689
+
690
+ /*
691
+ * Performs Batch Normalization during Inference:
692
+ * y[i] = bnScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + bnBias[k]
693
+ * with bnScale, bnBias, runningMean, runningInvVariance tensors indexed
694
+ * according to spatial or per-activation mode. Refer to cudnnBatchNormalizationForwardTraining
695
+ * above for notes on function arguments.
696
+ */
697
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
698
+ cudnnBatchNormalizationForwardInference(cudnnHandle_t handle,
699
+ cudnnBatchNormMode_t mode,
700
+ const void *alpha, /* alpha[0] = result blend factor */
701
+ const void *beta, /* beta[0] = dest layer blend factor */
702
+ const cudnnTensorDescriptor_t xDesc,
703
+ const void *x, /* NxCxHxW */
704
+ const cudnnTensorDescriptor_t yDesc,
705
+ void *y, /* NxCxHxW */
706
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
707
+ const void *bnScale,
708
+ const void *bnBias,
709
+ const void *estimatedMean,
710
+ const void *estimatedVariance,
711
+ double epsilon);
712
+
713
+ typedef enum {
714
+ /* bnScale, bnBias tensor dims are 1xCxHxWx.. (one value per CHW...-slice, normalized over N slice) */
715
+ CUDNN_NORM_PER_ACTIVATION = 0,
716
+
717
+ /* bnScale, bnBias tensor dims are 1xCx1x1 (one value per C-dim normalized over Nx1xHxW subtensors) */
718
+ CUDNN_NORM_PER_CHANNEL = 1,
719
+ } cudnnNormMode_t CUDNN_DEPRECATED;
720
+
721
+ typedef enum { CUDNN_NORM_ALGO_STANDARD = 0, CUDNN_NORM_ALGO_PERSIST = 1 } cudnnNormAlgo_t CUDNN_DEPRECATED;
722
+
723
+ /*
724
+ * Derives a tensor descriptor from layer data descriptor for Normalization
725
+ * scale, invVariance, bnBias, bnScale tensors. Use this tensor desc for
726
+ * normScaleBiasMeanVarDesc and normScaleBiasDiffDesc in Normalization forward and backward functions.
727
+ */
728
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
729
+ cudnnDeriveNormTensorDescriptor(cudnnTensorDescriptor_t derivedNormScaleBiasDesc,
730
+ cudnnTensorDescriptor_t derivedNormMeanVarDesc,
731
+ const cudnnTensorDescriptor_t xDesc,
732
+ cudnnNormMode_t mode,
733
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
734
+
735
+ typedef enum {
736
+ CUDNN_NORM_OPS_NORM = 0, /* do normalization only */
737
+ CUDNN_NORM_OPS_NORM_ACTIVATION = 1, /* do Norm, then activation */
738
+ CUDNN_NORM_OPS_NORM_ADD_ACTIVATION = 2, /* do Norm, then elemWiseAdd, then activation */
739
+ } cudnnNormOps_t CUDNN_DEPRECATED;
740
+
741
+ /*
742
+ * Performs Normalization during Inference:
743
+ * y[i] = normScale[k]*(x[i]-estimatedMean[k])/sqrt(epsilon+estimatedVariance[k]) + normBias[k]
744
+ * with normScale, normBias, runningMean, runningInvVariance tensors indexed
745
+ * according to per-channel or per-activation mode. Refer to cudnnNormalizationForwardTraining
746
+ * above for notes on function arguments.
747
+ */
748
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
749
+ cudnnNormalizationForwardInference(cudnnHandle_t handle,
750
+ cudnnNormMode_t mode,
751
+ cudnnNormOps_t normOps,
752
+ cudnnNormAlgo_t algo,
753
+ const void *alpha, /* alpha[0] = result blend factor */
754
+ const void *beta, /* beta[0] = dest layer blend factor */
755
+ const cudnnTensorDescriptor_t xDesc,
756
+ const void *x, /* NxCxHxW */
757
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
758
+ const void *normScale,
759
+ const void *normBias,
760
+ const cudnnTensorDescriptor_t normMeanVarDesc,
761
+ const void *estimatedMean,
762
+ const void *estimatedVariance,
763
+ const cudnnTensorDescriptor_t zDesc,
764
+ const void *z,
765
+ cudnnActivationDescriptor_t activationDesc,
766
+ const cudnnTensorDescriptor_t yDesc,
767
+ void *y, /* NxCxHxW */
768
+ double epsilon,
769
+ int groupCnt); /* Place hold for future work*/
770
+
771
+ /* APIs for spatial transformer network*/
772
+ typedef enum {
773
+ CUDNN_SAMPLER_BILINEAR = 0,
774
+ } cudnnSamplerType_t;
775
+
776
+ cudnnStatus_t CUDNNWINAPI
777
+ cudnnCreateSpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t *stDesc);
778
+
779
+ cudnnStatus_t CUDNNWINAPI
780
+ cudnnSetSpatialTransformerNdDescriptor(cudnnSpatialTransformerDescriptor_t stDesc,
781
+ cudnnSamplerType_t samplerType,
782
+ cudnnDataType_t dataType,
783
+ const int nbDims,
784
+ const int dimA[]);
785
+
786
+ cudnnStatus_t CUDNNWINAPI
787
+ cudnnDestroySpatialTransformerDescriptor(cudnnSpatialTransformerDescriptor_t stDesc);
788
+
789
+ cudnnStatus_t CUDNNWINAPI
790
+ cudnnSpatialTfGridGeneratorForward(cudnnHandle_t handle,
791
+ const cudnnSpatialTransformerDescriptor_t stDesc,
792
+ const void *theta,
793
+ void *grid);
794
+
795
+ cudnnStatus_t CUDNNWINAPI
796
+ cudnnSpatialTfSamplerForward(cudnnHandle_t handle,
797
+ cudnnSpatialTransformerDescriptor_t stDesc,
798
+ const void *alpha,
799
+ const cudnnTensorDescriptor_t xDesc,
800
+ const void *x,
801
+ const void *grid,
802
+ const void *beta,
803
+ cudnnTensorDescriptor_t yDesc,
804
+ void *y);
805
+
806
+ typedef struct cudnnDropoutStruct *cudnnDropoutDescriptor_t;
807
+
808
+ cudnnStatus_t CUDNNWINAPI
809
+ cudnnCreateDropoutDescriptor(cudnnDropoutDescriptor_t *dropoutDesc);
810
+
811
+ cudnnStatus_t CUDNNWINAPI
812
+ cudnnDestroyDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc);
813
+
814
+ /*helper function to determine size of the states to be passed to cudnnSetDropoutDescriptor */
815
+ cudnnStatus_t CUDNNWINAPI
816
+ cudnnDropoutGetStatesSize(cudnnHandle_t handle, size_t *sizeInBytes);
817
+
818
+ /*helper function to determine size of the reserve space to be passed to dropout forward/backward calls */
819
+ cudnnStatus_t CUDNNWINAPI
820
+ cudnnDropoutGetReserveSpaceSize(cudnnTensorDescriptor_t xdesc, size_t *sizeInBytes);
821
+
822
+ cudnnStatus_t CUDNNWINAPI
823
+ cudnnSetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
824
+ cudnnHandle_t handle,
825
+ float dropout,
826
+ void *states,
827
+ size_t stateSizeInBytes,
828
+ unsigned long long seed);
829
+
830
+ /* Restores the dropout descriptor to a previously saved-off state */
831
+ cudnnStatus_t CUDNNWINAPI
832
+ cudnnRestoreDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
833
+ cudnnHandle_t handle,
834
+ float dropout,
835
+ void *states,
836
+ size_t stateSizeInBytes,
837
+ unsigned long long seed);
838
+
839
+ cudnnStatus_t CUDNNWINAPI
840
+ cudnnGetDropoutDescriptor(cudnnDropoutDescriptor_t dropoutDesc,
841
+ cudnnHandle_t handle,
842
+ float *dropout,
843
+ void **states,
844
+ unsigned long long *seed);
845
+
846
+ cudnnStatus_t CUDNNWINAPI
847
+ cudnnDropoutForward(cudnnHandle_t handle,
848
+ const cudnnDropoutDescriptor_t dropoutDesc,
849
+ const cudnnTensorDescriptor_t xdesc,
850
+ const void *x,
851
+ const cudnnTensorDescriptor_t ydesc,
852
+ void *y,
853
+ void *reserveSpace,
854
+ size_t reserveSpaceSizeInBytes);
855
+
856
+ /* TODO: move these enums out to the appropriate submodule */
857
+ typedef enum {
858
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM = 0,
859
+ CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM = 1,
860
+ CUDNN_CONVOLUTION_FWD_ALGO_GEMM = 2,
861
+ CUDNN_CONVOLUTION_FWD_ALGO_DIRECT = 3,
862
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT = 4,
863
+ CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING = 5,
864
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD = 6,
865
+ CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED = 7,
866
+ CUDNN_CONVOLUTION_FWD_ALGO_COUNT = 8
867
+ } cudnnConvolutionFwdAlgo_t;
868
+
869
+ typedef enum {
870
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0 = 0, /* non-deterministic */
871
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1 = 1,
872
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT = 2,
873
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_3 = 3, /* non-deterministic */
874
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD = 4, /* not implemented */
875
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_WINOGRAD_NONFUSED = 5,
876
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING = 6,
877
+ CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT = 7
878
+ } cudnnConvolutionBwdFilterAlgo_t;
879
+
880
+ typedef enum {
881
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_0 = 0, /* non-deterministic */
882
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_1 = 1,
883
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT = 2,
884
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING = 3,
885
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD = 4,
886
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED = 5,
887
+ CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT = 6
888
+ } cudnnConvolutionBwdDataAlgo_t;
889
+
890
+ typedef enum { CUDNN_CTC_LOSS_ALGO_DETERMINISTIC = 0, CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC = 1 } cudnnCTCLossAlgo_t;
891
+
892
+ /*
893
+ * \brief Cross-library version checker.
894
+ * This function is implemented differently in each sub-library. Each sublib
895
+ * checks whether its own version matches that of its dependencies.
896
+ * \returns CUDNN_STATUS_SUCCESS if the version check passes,
897
+ * CUDNN_STATUS_SUBLIBRARY_VERSION_MISMATCH if the versions are inconsistent.
898
+ */
899
+ cudnnStatus_t CUDNNWINAPI
900
+ cudnnOpsVersionCheck(void);
901
+
902
+ /* Function to perform backward softmax */
903
+ cudnnStatus_t CUDNNWINAPI
904
+ cudnnSoftmaxBackward(cudnnHandle_t handle,
905
+ cudnnSoftmaxAlgorithm_t algo,
906
+ cudnnSoftmaxMode_t mode,
907
+ const void *alpha,
908
+ const cudnnTensorDescriptor_t yDesc,
909
+ const void *y,
910
+ const cudnnTensorDescriptor_t dyDesc,
911
+ const void *dy,
912
+ const void *beta,
913
+ const cudnnTensorDescriptor_t dxDesc,
914
+ void *dx);
915
+
916
+ /* Function to perform backward pooling */
917
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
918
+ cudnnPoolingBackward(cudnnHandle_t handle,
919
+ const cudnnPoolingDescriptor_t poolingDesc,
920
+ const void *alpha,
921
+ const cudnnTensorDescriptor_t yDesc,
922
+ const void *y,
923
+ const cudnnTensorDescriptor_t dyDesc,
924
+ const void *dy,
925
+ const cudnnTensorDescriptor_t xDesc,
926
+ const void *x,
927
+ const void *beta,
928
+ const cudnnTensorDescriptor_t dxDesc,
929
+ void *dx);
930
+
931
+ /* Function to perform backward activation */
932
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
933
+ cudnnActivationBackward(cudnnHandle_t handle,
934
+ cudnnActivationDescriptor_t activationDesc,
935
+ const void *alpha,
936
+ const cudnnTensorDescriptor_t yDesc,
937
+ const void *y,
938
+ const cudnnTensorDescriptor_t dyDesc,
939
+ const void *dy,
940
+ const cudnnTensorDescriptor_t xDesc,
941
+ const void *x,
942
+ const void *beta,
943
+ const cudnnTensorDescriptor_t dxDesc,
944
+ void *dx);
945
+
946
+ /* LRN cross-channel backward computation. Double parameters cast to tensor data type */
947
+ cudnnStatus_t CUDNNWINAPI
948
+ cudnnLRNCrossChannelBackward(cudnnHandle_t handle,
949
+ cudnnLRNDescriptor_t normDesc,
950
+ cudnnLRNMode_t lrnMode,
951
+ const void *alpha,
952
+ const cudnnTensorDescriptor_t yDesc,
953
+ const void *y,
954
+ const cudnnTensorDescriptor_t dyDesc,
955
+ const void *dy,
956
+ const cudnnTensorDescriptor_t xDesc,
957
+ const void *x,
958
+ const void *beta,
959
+ const cudnnTensorDescriptor_t dxDesc,
960
+ void *dx);
961
+
962
+ cudnnStatus_t CUDNNWINAPI
963
+ cudnnDivisiveNormalizationBackward(cudnnHandle_t handle,
964
+ cudnnLRNDescriptor_t normDesc,
965
+ cudnnDivNormMode_t mode,
966
+ const void *alpha,
967
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, means, dy, temp, temp2 */
968
+ const void *x,
969
+ const void *means, /* if NULL, means are assumed to be zero */
970
+ const void *dy,
971
+ void *temp,
972
+ void *temp2,
973
+ const void *beta,
974
+ const cudnnTensorDescriptor_t dXdMeansDesc, /* same desc for dx, dMeans */
975
+ void *dx, /* output x differential */
976
+ void *dMeans); /* output means differential, can be NULL */
977
+
978
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
979
+ cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize(cudnnHandle_t handle,
980
+ cudnnBatchNormMode_t mode,
981
+ cudnnBatchNormOps_t bnOps,
982
+ const cudnnTensorDescriptor_t xDesc,
983
+ const cudnnTensorDescriptor_t zDesc,
984
+ const cudnnTensorDescriptor_t yDesc,
985
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
986
+ const cudnnActivationDescriptor_t activationDesc,
987
+ size_t *sizeInBytes);
988
+
989
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
990
+ cudnnGetBatchNormalizationBackwardExWorkspaceSize(cudnnHandle_t handle,
991
+ cudnnBatchNormMode_t mode,
992
+ cudnnBatchNormOps_t bnOps,
993
+ const cudnnTensorDescriptor_t xDesc,
994
+ const cudnnTensorDescriptor_t yDesc,
995
+ const cudnnTensorDescriptor_t dyDesc,
996
+ const cudnnTensorDescriptor_t dzDesc,
997
+ const cudnnTensorDescriptor_t dxDesc,
998
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
999
+ const cudnnActivationDescriptor_t activationDesc,
1000
+ size_t *sizeInBytes);
1001
+
1002
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1003
+ cudnnGetBatchNormalizationTrainingExReserveSpaceSize(cudnnHandle_t handle,
1004
+ cudnnBatchNormMode_t mode,
1005
+ cudnnBatchNormOps_t bnOps,
1006
+ const cudnnActivationDescriptor_t activationDesc,
1007
+ const cudnnTensorDescriptor_t xDesc,
1008
+ size_t *sizeInBytes);
1009
+
1010
+ /* Computes y = BN(x). Also accumulates moving averages of mean and inverse variances */
1011
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1012
+ cudnnBatchNormalizationForwardTraining(
1013
+ cudnnHandle_t handle,
1014
+ cudnnBatchNormMode_t mode,
1015
+
1016
+ const void *alpha, /* alpha[0] = result blend factor */
1017
+ const void *beta, /* beta[0] = dest layer blend factor */
1018
+
1019
+ const cudnnTensorDescriptor_t xDesc,
1020
+ const void *x, /* NxCxHxW */
1021
+ const cudnnTensorDescriptor_t yDesc,
1022
+ void *y, /* NxCxHxW */
1023
+
1024
+ /* Shared desc for the next 6 tensors in the argument list.
1025
+ Data type to be set as follows:
1026
+ type = (typeOf(x) == double) ? double : float
1027
+ Dimensions for this descriptor depend on normalization mode
1028
+ - Spatial Normalization : tensors are expected to have dims 1xCx1x1
1029
+ (normalization is performed across NxHxW)
1030
+ - Per-Activation Normalization : tensors are expected to have dims of 1xCxHxW
1031
+ (normalization is performed across N) */
1032
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
1033
+
1034
+ /* 'Gamma' and 'Beta' respectively in Ioffe and Szegedy's paper's notation */
1035
+ const void *bnScale,
1036
+ const void *bnBias,
1037
+
1038
+ /* MUST use factor=1 in the very first call of a complete training cycle.
1039
+ Use a factor=1/(1+n) at N-th call to the function to get
1040
+ Cumulative Moving Average (CMA) behavior
1041
+ CMA[n] = (x[1]+...+x[n])/n
1042
+ Since CMA[n+1] = (n*CMA[n]+x[n+1])/(n+1) =
1043
+ ((n+1)*CMA[n]-CMA[n])/(n+1) + x[n+1]/(n+1) =
1044
+ CMA[n]*(1-1/(n+1)) + x[n+1]*1/(n+1) */
1045
+ double exponentialAverageFactor,
1046
+
1047
+ /* Used in Training phase only.
1048
+ runningMean = newMean*factor + runningMean*(1-factor) */
1049
+ void *resultRunningMean,
1050
+ /* Output in training mode, input in inference. Is the moving average
1051
+ of variance[x] (factor is applied in the same way as for runningMean) */
1052
+ void *resultRunningVariance,
1053
+
1054
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
1055
+ double epsilon,
1056
+
1057
+ /* Optionally save intermediate results from the forward pass here
1058
+ - can be reused to speed up backward pass. NULL if unused */
1059
+ void *resultSaveMean,
1060
+ void *resultSaveInvVariance);
1061
+
1062
+ /* Computes y = relu(BN(x) + z). Also accumulates moving averages of mean and inverse variances */
1063
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1064
+ cudnnBatchNormalizationForwardTrainingEx(
1065
+ cudnnHandle_t handle,
1066
+ cudnnBatchNormMode_t mode,
1067
+ cudnnBatchNormOps_t bnOps,
1068
+
1069
+ const void *alpha, /* alpha[0] = result blend factor */
1070
+ const void *beta, /* beta[0] = dest layer blend factor */
1071
+
1072
+ const cudnnTensorDescriptor_t xDesc,
1073
+ const void *xData,
1074
+ const cudnnTensorDescriptor_t zDesc,
1075
+ const void *zData,
1076
+ const cudnnTensorDescriptor_t yDesc,
1077
+ void *yData,
1078
+
1079
+ const cudnnTensorDescriptor_t bnScaleBiasMeanVarDesc,
1080
+ const void *bnScale,
1081
+ const void *bnBias,
1082
+
1083
+ double exponentialAverageFactor,
1084
+ void *resultRunningMean,
1085
+ void *resultRunningVariance,
1086
+
1087
+ /* Has to be >= CUDNN_BN_MIN_EPSILON. Should be the same in forward and backward functions. */
1088
+ double epsilon,
1089
+
1090
+ /* Optionally save intermediate results from the forward pass here
1091
+ - can be reused to speed up backward pass. NULL if unused */
1092
+ void *resultSaveMean,
1093
+ void *resultSaveInvVariance,
1094
+
1095
+ cudnnActivationDescriptor_t activationDesc,
1096
+ void *workspace,
1097
+ size_t workSpaceSizeInBytes,
1098
+ void *reserveSpace,
1099
+ size_t reserveSpaceSizeInBytes);
1100
+
1101
+ /* Performs backward pass of Batch Normalization layer. Returns x gradient,
1102
+ * bnScale gradient and bnBias gradient */
1103
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1104
+ cudnnBatchNormalizationBackward(cudnnHandle_t handle,
1105
+ cudnnBatchNormMode_t mode,
1106
+ const void *alphaDataDiff,
1107
+ const void *betaDataDiff,
1108
+ const void *alphaParamDiff,
1109
+ const void *betaParamDiff,
1110
+ const cudnnTensorDescriptor_t xDesc, /* same desc for x, dx, dy */
1111
+ const void *x,
1112
+ const cudnnTensorDescriptor_t dyDesc,
1113
+ const void *dy,
1114
+ const cudnnTensorDescriptor_t dxDesc,
1115
+ void *dx,
1116
+ /* Shared tensor desc for the 4 tensors below */
1117
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
1118
+ const void *bnScale, /* bnBias doesn't affect backpropagation */
1119
+ /* scale and bias diff are not backpropagated below this layer */
1120
+ void *dBnScaleResult,
1121
+ void *dBnBiasResult,
1122
+ /* Same epsilon as forward pass */
1123
+ double epsilon,
1124
+
1125
+ /* Optionally cached intermediate results from
1126
+ forward pass */
1127
+ const void *savedMean,
1128
+ const void *savedInvVariance);
1129
+
1130
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1131
+ cudnnBatchNormalizationBackwardEx(cudnnHandle_t handle,
1132
+ cudnnBatchNormMode_t mode,
1133
+ cudnnBatchNormOps_t bnOps,
1134
+
1135
+ const void *alphaDataDiff,
1136
+ const void *betaDataDiff,
1137
+ const void *alphaParamDiff,
1138
+ const void *betaParamDiff,
1139
+ const cudnnTensorDescriptor_t xDesc,
1140
+ const void *xData,
1141
+ const cudnnTensorDescriptor_t yDesc,
1142
+ const void *yData,
1143
+ const cudnnTensorDescriptor_t dyDesc,
1144
+ const void *dyData,
1145
+ const cudnnTensorDescriptor_t dzDesc,
1146
+ void *dzData,
1147
+ const cudnnTensorDescriptor_t dxDesc,
1148
+ void *dxData,
1149
+
1150
+ /* Shared tensor desc for the 4 tensors below */
1151
+ const cudnnTensorDescriptor_t dBnScaleBiasDesc,
1152
+ const void *bnScaleData,
1153
+ const void *bnBiasData, /* needed if there is activation */
1154
+ void *dBnScaleData,
1155
+ void *dBnBiasData,
1156
+ double epsilon, /* Same epsilon as forward pass */
1157
+
1158
+ /* Optionally cached intermediate results from
1159
+ forward pass */
1160
+ const void *savedMean,
1161
+ const void *savedInvVariance,
1162
+ cudnnActivationDescriptor_t activationDesc,
1163
+ void *workSpace,
1164
+ size_t workSpaceSizeInBytes,
1165
+ void *reserveSpace,
1166
+ size_t reserveSpaceSizeInBytes);
1167
+
1168
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1169
+ cudnnGetNormalizationForwardTrainingWorkspaceSize(cudnnHandle_t handle,
1170
+ cudnnNormMode_t mode,
1171
+ cudnnNormOps_t normOps,
1172
+ cudnnNormAlgo_t algo,
1173
+ const cudnnTensorDescriptor_t xDesc,
1174
+ const cudnnTensorDescriptor_t zDesc,
1175
+ const cudnnTensorDescriptor_t yDesc,
1176
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
1177
+ const cudnnActivationDescriptor_t activationDesc,
1178
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1179
+ size_t *sizeInBytes,
1180
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1181
+
1182
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1183
+ cudnnGetNormalizationBackwardWorkspaceSize(cudnnHandle_t handle,
1184
+ cudnnNormMode_t mode,
1185
+ cudnnNormOps_t normOps,
1186
+ cudnnNormAlgo_t algo,
1187
+ const cudnnTensorDescriptor_t xDesc,
1188
+ const cudnnTensorDescriptor_t yDesc,
1189
+ const cudnnTensorDescriptor_t dyDesc,
1190
+ const cudnnTensorDescriptor_t dzDesc,
1191
+ const cudnnTensorDescriptor_t dxDesc,
1192
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
1193
+ const cudnnActivationDescriptor_t activationDesc,
1194
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1195
+ size_t *sizeInBytes,
1196
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1197
+
1198
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1199
+ cudnnGetNormalizationTrainingReserveSpaceSize(cudnnHandle_t handle,
1200
+ cudnnNormMode_t mode,
1201
+ cudnnNormOps_t normOps,
1202
+ cudnnNormAlgo_t algo,
1203
+ const cudnnActivationDescriptor_t activationDesc,
1204
+ const cudnnTensorDescriptor_t xDesc,
1205
+ size_t *sizeInBytes,
1206
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1207
+
1208
+ /* Computes y = relu(Norm(x) + z). Also accumulates moving averages of mean and inverse variances */
1209
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1210
+ cudnnNormalizationForwardTraining(cudnnHandle_t handle,
1211
+ cudnnNormMode_t mode,
1212
+ cudnnNormOps_t normOps,
1213
+ cudnnNormAlgo_t algo,
1214
+ const void *alpha, /* alpha[0] = result blend factor */
1215
+ const void *beta, /* beta[0] = dest layer blend factor */
1216
+ const cudnnTensorDescriptor_t xDesc,
1217
+ const void *xData,
1218
+ const cudnnTensorDescriptor_t normScaleBiasDesc,
1219
+ const void *normScale,
1220
+ const void *normBias,
1221
+ double exponentialAverageFactor,
1222
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1223
+ void *resultRunningMean,
1224
+ void *resultRunningVariance,
1225
+ /* Has to be >= 0. Should be the same in forward and backward functions. */
1226
+ double epsilon,
1227
+ /* Optionally save intermediate results from the forward pass here
1228
+ - can be reused to speed up backward pass. NULL if unused */
1229
+ void *resultSaveMean,
1230
+ void *resultSaveInvVariance,
1231
+ cudnnActivationDescriptor_t activationDesc,
1232
+ const cudnnTensorDescriptor_t zDesc,
1233
+ const void *zData,
1234
+ const cudnnTensorDescriptor_t yDesc,
1235
+ void *yData,
1236
+ void *workspace,
1237
+ size_t workSpaceSizeInBytes,
1238
+ void *reserveSpace,
1239
+ size_t reserveSpaceSizeInBytes,
1240
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1241
+
1242
+ CUDNN_DEPRECATED cudnnStatus_t CUDNNWINAPI
1243
+ cudnnNormalizationBackward(cudnnHandle_t handle,
1244
+ cudnnNormMode_t mode,
1245
+ cudnnNormOps_t normOps,
1246
+ cudnnNormAlgo_t algo,
1247
+ const void *alphaDataDiff,
1248
+ const void *betaDataDiff,
1249
+ const void *alphaParamDiff,
1250
+ const void *betaParamDiff,
1251
+ const cudnnTensorDescriptor_t xDesc,
1252
+ const void *xData,
1253
+ const cudnnTensorDescriptor_t yDesc,
1254
+ const void *yData,
1255
+ const cudnnTensorDescriptor_t dyDesc,
1256
+ const void *dyData,
1257
+ const cudnnTensorDescriptor_t dzDesc,
1258
+ void *dzData,
1259
+ const cudnnTensorDescriptor_t dxDesc,
1260
+ void *dxData,
1261
+ /* Shared tensor desc for the 4 tensors below */
1262
+ const cudnnTensorDescriptor_t dNormScaleBiasDesc,
1263
+ const void *normScaleData,
1264
+ const void *normBiasData, /* needed if there is activation */
1265
+ void *dNormScaleData,
1266
+ void *dNormBiasData,
1267
+ double epsilon, /* Same epsilon as forward pass */
1268
+ const cudnnTensorDescriptor_t normMeanVarDesc,
1269
+ /* Optionally cached intermediate results from
1270
+ forward pass */
1271
+ const void *savedMean,
1272
+ const void *savedInvVariance,
1273
+ cudnnActivationDescriptor_t activationDesc,
1274
+ void *workSpace,
1275
+ size_t workSpaceSizeInBytes,
1276
+ void *reserveSpace,
1277
+ size_t reserveSpaceSizeInBytes,
1278
+ int groupCnt); /* Place hold for future work, should be set to 1 now*/
1279
+
1280
+ cudnnStatus_t CUDNNWINAPI
1281
+ cudnnSpatialTfGridGeneratorBackward(cudnnHandle_t handle,
1282
+ const cudnnSpatialTransformerDescriptor_t stDesc,
1283
+ const void *dgrid,
1284
+ void *dtheta);
1285
+
1286
+ cudnnStatus_t CUDNNWINAPI
1287
+ cudnnSpatialTfSamplerBackward(cudnnHandle_t handle,
1288
+ cudnnSpatialTransformerDescriptor_t stDesc,
1289
+ const void *alpha,
1290
+ const cudnnTensorDescriptor_t xDesc,
1291
+ const void *x,
1292
+ const void *beta,
1293
+ const cudnnTensorDescriptor_t dxDesc,
1294
+ void *dx,
1295
+ const void *alphaDgrid,
1296
+ const cudnnTensorDescriptor_t dyDesc,
1297
+ const void *dy,
1298
+ const void *grid,
1299
+ const void *betaDgrid,
1300
+ void *dgrid);
1301
+
1302
+ cudnnStatus_t CUDNNWINAPI
1303
+ cudnnDropoutBackward(cudnnHandle_t handle,
1304
+ const cudnnDropoutDescriptor_t dropoutDesc,
1305
+ const cudnnTensorDescriptor_t dydesc,
1306
+ const void *dy,
1307
+ const cudnnTensorDescriptor_t dxdesc,
1308
+ void *dx,
1309
+ void *reserveSpace,
1310
+ size_t reserveSpaceSizeInBytes);
1311
+
1312
+ #if defined(__cplusplus)
1313
+ }
1314
+ #endif
1315
+
1316
+ #endif /* CUDNN_OPS_H_ */
.venv/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn_version_v9.h ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2014-2023 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ /**
51
+ * \file: The master cuDNN version file.
52
+ */
53
+
54
+ #ifndef CUDNN_VERSION_H_
55
+ #define CUDNN_VERSION_H_
56
+
57
+ #define CUDNN_MAJOR 9
58
+ #define CUDNN_MINOR 1
59
+ #define CUDNN_PATCHLEVEL 0
60
+
61
+ #define CUDNN_VERSION (CUDNN_MAJOR * 10000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)
62
+
63
+ /* cannot use constexpr here since this is a C-only file */
64
+ /* Below is the max SM version this cuDNN library is aware of and supports natively */
65
+
66
+ #define CUDNN_MAX_SM_MAJOR_NUMBER 9
67
+ #define CUDNN_MAX_SM_MINOR_NUMBER 0
68
+ #define CUDNN_MAX_DEVICE_VERSION (CUDNN_MAX_SM_MAJOR_NUMBER * 100 + CUDNN_MAX_SM_MINOR_NUMBER * 10)
69
+
70
+ #endif /* CUDNN_VERSION_H */
.venv/lib/python3.11/site-packages/nvidia/cusolver/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cusolver/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (188 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cusolver/include/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/nvidia/cusolver/include/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (196 Bytes). View file
 
.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverDn.h ADDED
The diff for this file is too large to render. See raw diff
 
.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverMg.h ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 2019 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(CUSOLVERMG_H_)
51
+ #define CUSOLVERMG_H_
52
+
53
+ #include <stdint.h>
54
+ #include "cusolverDn.h"
55
+
56
+ #if defined(__cplusplus)
57
+ extern "C" {
58
+ #endif /* __cplusplus */
59
+
60
+ struct cusolverMgContext;
61
+ typedef struct cusolverMgContext *cusolverMgHandle_t;
62
+
63
+ /**
64
+ * \beief This enum decides how 1D device Ids (or process ranks) get mapped to
65
+ * a 2D grid.
66
+ */
67
+ typedef enum {
68
+
69
+ CUDALIBMG_GRID_MAPPING_ROW_MAJOR = 1,
70
+ CUDALIBMG_GRID_MAPPING_COL_MAJOR = 0
71
+
72
+ } cusolverMgGridMapping_t;
73
+
74
+ /** \brief Opaque structure of the distributed grid */
75
+ typedef void *cudaLibMgGrid_t;
76
+ /** \brief Opaque structure of the distributed matrix descriptor */
77
+ typedef void *cudaLibMgMatrixDesc_t;
78
+
79
+ cusolverStatus_t CUSOLVERAPI cusolverMgCreate(cusolverMgHandle_t *handle);
80
+
81
+ cusolverStatus_t CUSOLVERAPI cusolverMgDestroy(cusolverMgHandle_t handle);
82
+
83
+ cusolverStatus_t CUSOLVERAPI cusolverMgDeviceSelect(
84
+ cusolverMgHandle_t handle,
85
+ int nbDevices,
86
+ int deviceId[]);
87
+
88
+ /**
89
+ * \brief Allocates resources related to the shared memory device grid.
90
+ * \param[out] grid the opaque data strcuture that holds the grid
91
+ * \param[in] numRowDevices number of devices in the row
92
+ * \param[in] numColDevices number of devices in the column
93
+ * \param[in] deviceId This array of size height * width stores the
94
+ * device-ids of the 2D grid; each entry must correspond to a valid
95
+ * gpu or to -1 (denoting CPU). \param[in] mapping whether the 2D grid is in
96
+ * row/column major \returns the status code
97
+ */
98
+ cusolverStatus_t CUSOLVERAPI cusolverMgCreateDeviceGrid(
99
+ cudaLibMgGrid_t * grid,
100
+ int32_t numRowDevices,
101
+ int32_t numColDevices,
102
+ const int32_t deviceId[],
103
+ cusolverMgGridMapping_t mapping);
104
+
105
+ /**
106
+ * \brief Releases the allocated resources related to the distributed grid.
107
+ * \param[in] grid the opaque data strcuture that holds the distributed grid
108
+ * \returns the status code
109
+ */
110
+ cusolverStatus_t CUSOLVERAPI cusolverMgDestroyGrid(cudaLibMgGrid_t grid);
111
+
112
+ /**
113
+ * \brief Allocates resources related to the distributed matrix descriptor.
114
+ * \param[out] desc the opaque data strcuture that holds the descriptor
115
+ * \param[in] numRows number of total rows
116
+ * \param[in] numCols number of total columns
117
+ * \param[in] rowBlockSize row block size
118
+ * \param[in] colBlockSize column block size
119
+ * \param[in] dataType the data type of each element in cudaDataType
120
+ * \param[in] grid the opaque data structure of the distributed grid
121
+ * \returns the status code
122
+ */
123
+ cusolverStatus_t CUSOLVERAPI cusolverMgCreateMatrixDesc(
124
+ cudaLibMgMatrixDesc_t *desc,
125
+ int64_t numRows,
126
+ int64_t numCols,
127
+ int64_t rowBlockSize,
128
+ int64_t colBlockSize,
129
+ cudaDataType dataType,
130
+ const cudaLibMgGrid_t grid);
131
+
132
+ /**
133
+ * \brief Releases the allocated resources related to the distributed matrix
134
+ * descriptor. \param[in] desc the opaque data strcuture that holds the
135
+ * descriptor \returns the status code
136
+ */
137
+ cusolverStatus_t CUSOLVERAPI
138
+ cusolverMgDestroyMatrixDesc(cudaLibMgMatrixDesc_t desc);
139
+
140
+ cusolverStatus_t CUSOLVERAPI cusolverMgSyevd_bufferSize(
141
+ cusolverMgHandle_t handle,
142
+ cusolverEigMode_t jobz,
143
+ cublasFillMode_t uplo,
144
+ int N,
145
+ void * array_d_A[],
146
+ int IA,
147
+ int JA,
148
+ cudaLibMgMatrixDesc_t descrA,
149
+ void * W,
150
+ cudaDataType dataTypeW,
151
+ cudaDataType computeType,
152
+ int64_t * lwork);
153
+
154
+ cusolverStatus_t CUSOLVERAPI cusolverMgSyevd(
155
+ cusolverMgHandle_t handle,
156
+ cusolverEigMode_t jobz,
157
+ cublasFillMode_t uplo,
158
+ int N,
159
+ void * array_d_A[],
160
+ int IA,
161
+ int JA,
162
+ cudaLibMgMatrixDesc_t descrA,
163
+ void * W,
164
+ cudaDataType dataTypeW,
165
+ cudaDataType computeType,
166
+ void * array_d_work[],
167
+ int64_t lwork,
168
+ int * info);
169
+
170
+ cusolverStatus_t CUSOLVERAPI cusolverMgGetrf_bufferSize(
171
+ cusolverMgHandle_t handle,
172
+ int M,
173
+ int N,
174
+ void * array_d_A[],
175
+ int IA,
176
+ int JA,
177
+ cudaLibMgMatrixDesc_t descrA,
178
+ int * array_d_IPIV[],
179
+ cudaDataType computeType,
180
+ int64_t * lwork);
181
+
182
+ cusolverStatus_t CUSOLVERAPI cusolverMgGetrf(
183
+ cusolverMgHandle_t handle,
184
+ int M,
185
+ int N,
186
+ void * array_d_A[],
187
+ int IA,
188
+ int JA,
189
+ cudaLibMgMatrixDesc_t descrA,
190
+ int * array_d_IPIV[],
191
+ cudaDataType computeType,
192
+ void * array_d_work[],
193
+ int64_t lwork,
194
+ int * info);
195
+
196
+ cusolverStatus_t CUSOLVERAPI cusolverMgGetrs_bufferSize(
197
+ cusolverMgHandle_t handle,
198
+ cublasOperation_t TRANS,
199
+ int N,
200
+ int NRHS,
201
+ void * array_d_A[],
202
+ int IA,
203
+ int JA,
204
+ cudaLibMgMatrixDesc_t descrA,
205
+ int * array_d_IPIV[],
206
+ void * array_d_B[],
207
+ int IB,
208
+ int JB,
209
+ cudaLibMgMatrixDesc_t descrB,
210
+ cudaDataType computeType,
211
+ int64_t * lwork);
212
+
213
+ cusolverStatus_t CUSOLVERAPI cusolverMgGetrs(
214
+ cusolverMgHandle_t handle,
215
+ cublasOperation_t TRANS,
216
+ int N,
217
+ int NRHS,
218
+ void * array_d_A[],
219
+ int IA,
220
+ int JA,
221
+ cudaLibMgMatrixDesc_t descrA,
222
+ int * array_d_IPIV[],
223
+ void * array_d_B[],
224
+ int IB,
225
+ int JB,
226
+ cudaLibMgMatrixDesc_t descrB,
227
+ cudaDataType computeType,
228
+ void * array_d_work[],
229
+ int64_t lwork,
230
+ int * info);
231
+
232
+ cusolverStatus_t CUSOLVERAPI cusolverMgPotrf_bufferSize(
233
+ cusolverMgHandle_t handle,
234
+ cublasFillMode_t uplo,
235
+ int N,
236
+ void * array_d_A[],
237
+ int IA,
238
+ int JA,
239
+ cudaLibMgMatrixDesc_t descrA,
240
+ cudaDataType computeType,
241
+ int64_t * lwork);
242
+
243
+ cusolverStatus_t CUSOLVERAPI cusolverMgPotrf(
244
+ cusolverMgHandle_t handle,
245
+ cublasFillMode_t uplo,
246
+ int N,
247
+ void * array_d_A[],
248
+ int IA,
249
+ int JA,
250
+ cudaLibMgMatrixDesc_t descrA,
251
+ cudaDataType computeType,
252
+ void * array_d_work[],
253
+ int64_t lwork,
254
+ int * h_info);
255
+
256
+ cusolverStatus_t CUSOLVERAPI cusolverMgPotrs_bufferSize(
257
+ cusolverMgHandle_t handle,
258
+ cublasFillMode_t uplo,
259
+ int n,
260
+ int nrhs,
261
+ void * array_d_A[],
262
+ int IA,
263
+ int JA,
264
+ cudaLibMgMatrixDesc_t descrA,
265
+ void * array_d_B[],
266
+ int IB,
267
+ int JB,
268
+ cudaLibMgMatrixDesc_t descrB,
269
+ cudaDataType computeType,
270
+ int64_t * lwork);
271
+
272
+ cusolverStatus_t CUSOLVERAPI cusolverMgPotrs(
273
+ cusolverMgHandle_t handle,
274
+ cublasFillMode_t uplo,
275
+ int n,
276
+ int nrhs,
277
+ void * array_d_A[],
278
+ int IA,
279
+ int JA,
280
+ cudaLibMgMatrixDesc_t descrA,
281
+ void * array_d_B[],
282
+ int IB,
283
+ int JB,
284
+ cudaLibMgMatrixDesc_t descrB,
285
+ cudaDataType computeType,
286
+ void * array_d_work[],
287
+ int64_t lwork,
288
+ int * h_info);
289
+
290
+ cusolverStatus_t CUSOLVERAPI cusolverMgPotri_bufferSize(
291
+ cusolverMgHandle_t handle,
292
+ cublasFillMode_t uplo,
293
+ int N,
294
+ void * array_d_A[],
295
+ int IA,
296
+ int JA,
297
+ cudaLibMgMatrixDesc_t descrA,
298
+ cudaDataType computeType,
299
+ int64_t * lwork);
300
+
301
+ cusolverStatus_t CUSOLVERAPI cusolverMgPotri(
302
+ cusolverMgHandle_t handle,
303
+ cublasFillMode_t uplo,
304
+ int N,
305
+ void * array_d_A[],
306
+ int IA,
307
+ int JA,
308
+ cudaLibMgMatrixDesc_t descrA,
309
+ cudaDataType computeType,
310
+ void * array_d_work[],
311
+ int64_t lwork,
312
+ int * h_info);
313
+
314
+ #if defined(__cplusplus)
315
+ }
316
+ #endif /* __cplusplus */
317
+
318
+ #endif // CUSOLVERMG_H_
.venv/lib/python3.11/site-packages/nvidia/cusolver/include/cusolverRf.h ADDED
@@ -0,0 +1,339 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /*
2
+ * Copyright 1993-2014 NVIDIA Corporation. All rights reserved.
3
+ *
4
+ * NOTICE TO LICENSEE:
5
+ *
6
+ * This source code and/or documentation ("Licensed Deliverables") are
7
+ * subject to NVIDIA intellectual property rights under U.S. and
8
+ * international Copyright laws.
9
+ *
10
+ * These Licensed Deliverables contained herein is PROPRIETARY and
11
+ * CONFIDENTIAL to NVIDIA and is being provided under the terms and
12
+ * conditions of a form of NVIDIA software license agreement by and
13
+ * between NVIDIA and Licensee ("License Agreement") or electronically
14
+ * accepted by Licensee. Notwithstanding any terms or conditions to
15
+ * the contrary in the License Agreement, reproduction or disclosure
16
+ * of the Licensed Deliverables to any third party without the express
17
+ * written consent of NVIDIA is prohibited.
18
+ *
19
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
20
+ * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
21
+ * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
22
+ * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
23
+ * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
24
+ * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
25
+ * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
26
+ * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
27
+ * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
28
+ * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
29
+ * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
30
+ * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
31
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
32
+ * OF THESE LICENSED DELIVERABLES.
33
+ *
34
+ * U.S. Government End Users. These Licensed Deliverables are a
35
+ * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
36
+ * 1995), consisting of "commercial computer software" and "commercial
37
+ * computer software documentation" as such terms are used in 48
38
+ * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
39
+ * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
40
+ * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
41
+ * U.S. Government End Users acquire the Licensed Deliverables with
42
+ * only those rights set forth herein.
43
+ *
44
+ * Any use of the Licensed Deliverables in individual and commercial
45
+ * software must include, in the user documentation and internal
46
+ * comments to the code, the above Disclaimer and U.S. Government End
47
+ * Users Notice.
48
+ */
49
+
50
+ #if !defined(CUSOLVERRF_H_)
51
+ #define CUSOLVERRF_H_
52
+
53
+ #include "driver_types.h"
54
+ #include "cuComplex.h"
55
+ #include "cusolver_common.h"
56
+
57
+ #if defined(__cplusplus)
58
+ extern "C" {
59
+ #endif /* __cplusplus */
60
+
61
+ /* CUSOLVERRF mode */
62
+ typedef enum {
63
+ CUSOLVERRF_RESET_VALUES_FAST_MODE_OFF = 0, // default
64
+ CUSOLVERRF_RESET_VALUES_FAST_MODE_ON = 1
65
+ } cusolverRfResetValuesFastMode_t;
66
+
67
+ /* CUSOLVERRF matrix format */
68
+ typedef enum {
69
+ CUSOLVERRF_MATRIX_FORMAT_CSR = 0, // default
70
+ CUSOLVERRF_MATRIX_FORMAT_CSC = 1
71
+ } cusolverRfMatrixFormat_t;
72
+
73
+ /* CUSOLVERRF unit diagonal */
74
+ typedef enum {
75
+ CUSOLVERRF_UNIT_DIAGONAL_STORED_L = 0, // default
76
+ CUSOLVERRF_UNIT_DIAGONAL_STORED_U = 1,
77
+ CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_L = 2,
78
+ CUSOLVERRF_UNIT_DIAGONAL_ASSUMED_U = 3
79
+ } cusolverRfUnitDiagonal_t;
80
+
81
+ /* CUSOLVERRF factorization algorithm */
82
+ typedef enum {
83
+ CUSOLVERRF_FACTORIZATION_ALG0 = 0, // default
84
+ CUSOLVERRF_FACTORIZATION_ALG1 = 1,
85
+ CUSOLVERRF_FACTORIZATION_ALG2 = 2,
86
+ } cusolverRfFactorization_t;
87
+
88
+ /* CUSOLVERRF triangular solve algorithm */
89
+ typedef enum {
90
+ CUSOLVERRF_TRIANGULAR_SOLVE_ALG1 = 1, // default
91
+ CUSOLVERRF_TRIANGULAR_SOLVE_ALG2 = 2,
92
+ CUSOLVERRF_TRIANGULAR_SOLVE_ALG3 = 3
93
+ } cusolverRfTriangularSolve_t;
94
+
95
+ /* CUSOLVERRF numeric boost report */
96
+ typedef enum {
97
+ CUSOLVERRF_NUMERIC_BOOST_NOT_USED = 0, // default
98
+ CUSOLVERRF_NUMERIC_BOOST_USED = 1
99
+ } cusolverRfNumericBoostReport_t;
100
+
101
+ /* Opaque structure holding CUSOLVERRF library common */
102
+ struct cusolverRfCommon;
103
+ typedef struct cusolverRfCommon* cusolverRfHandle_t;
104
+
105
+ /* CUSOLVERRF create (allocate memory) and destroy (free memory) in the handle
106
+ */
107
+ cusolverStatus_t CUSOLVERAPI cusolverRfCreate(cusolverRfHandle_t* handle);
108
+ cusolverStatus_t CUSOLVERAPI cusolverRfDestroy(cusolverRfHandle_t handle);
109
+
110
+ /* CUSOLVERRF set and get input format */
111
+ cusolverStatus_t CUSOLVERAPI cusolverRfGetMatrixFormat(
112
+ cusolverRfHandle_t handle,
113
+ cusolverRfMatrixFormat_t* format,
114
+ cusolverRfUnitDiagonal_t* diag);
115
+
116
+ cusolverStatus_t CUSOLVERAPI cusolverRfSetMatrixFormat(
117
+ cusolverRfHandle_t handle,
118
+ cusolverRfMatrixFormat_t format,
119
+ cusolverRfUnitDiagonal_t diag);
120
+
121
+ /* CUSOLVERRF set and get numeric properties */
122
+ cusolverStatus_t CUSOLVERAPI cusolverRfSetNumericProperties(
123
+ cusolverRfHandle_t handle,
124
+ double zero,
125
+ double boost);
126
+
127
+ cusolverStatus_t CUSOLVERAPI cusolverRfGetNumericProperties(
128
+ cusolverRfHandle_t handle,
129
+ double* zero,
130
+ double* boost);
131
+
132
+ cusolverStatus_t CUSOLVERAPI cusolverRfGetNumericBoostReport(
133
+ cusolverRfHandle_t handle,
134
+ cusolverRfNumericBoostReport_t* report);
135
+
136
+ /* CUSOLVERRF choose the triangular solve algorithm */
137
+ cusolverStatus_t CUSOLVERAPI cusolverRfSetAlgs(
138
+ cusolverRfHandle_t handle,
139
+ cusolverRfFactorization_t factAlg,
140
+ cusolverRfTriangularSolve_t solveAlg);
141
+
142
+ cusolverStatus_t CUSOLVERAPI cusolverRfGetAlgs(
143
+ cusolverRfHandle_t handle,
144
+ cusolverRfFactorization_t* factAlg,
145
+ cusolverRfTriangularSolve_t* solveAlg);
146
+
147
+ /* CUSOLVERRF set and get fast mode */
148
+ cusolverStatus_t CUSOLVERAPI cusolverRfGetResetValuesFastMode(
149
+ cusolverRfHandle_t handle,
150
+ cusolverRfResetValuesFastMode_t* fastMode);
151
+
152
+ cusolverStatus_t CUSOLVERAPI cusolverRfSetResetValuesFastMode(
153
+ cusolverRfHandle_t handle,
154
+ cusolverRfResetValuesFastMode_t fastMode);
155
+
156
+ /*** Non-Batched Routines ***/
157
+ /* CUSOLVERRF setup of internal structures from host or device memory */
158
+ cusolverStatus_t CUSOLVERAPI
159
+ cusolverRfSetupHost(/* Input (in the host memory) */
160
+ int n,
161
+ int nnzA,
162
+ int* h_csrRowPtrA,
163
+ int* h_csrColIndA,
164
+ double* h_csrValA,
165
+ int nnzL,
166
+ int* h_csrRowPtrL,
167
+ int* h_csrColIndL,
168
+ double* h_csrValL,
169
+ int nnzU,
170
+ int* h_csrRowPtrU,
171
+ int* h_csrColIndU,
172
+ double* h_csrValU,
173
+ int* h_P,
174
+ int* h_Q,
175
+ /* Output */
176
+ cusolverRfHandle_t handle);
177
+
178
+ cusolverStatus_t CUSOLVERAPI
179
+ cusolverRfSetupDevice(/* Input (in the device memory) */
180
+ int n,
181
+ int nnzA,
182
+ int* csrRowPtrA,
183
+ int* csrColIndA,
184
+ double* csrValA,
185
+ int nnzL,
186
+ int* csrRowPtrL,
187
+ int* csrColIndL,
188
+ double* csrValL,
189
+ int nnzU,
190
+ int* csrRowPtrU,
191
+ int* csrColIndU,
192
+ double* csrValU,
193
+ int* P,
194
+ int* Q,
195
+ /* Output */
196
+ cusolverRfHandle_t handle);
197
+
198
+ /* CUSOLVERRF update the matrix values (assuming the reordering, pivoting
199
+ and consequently the sparsity pattern of L and U did not change),
200
+ and zero out the remaining values. */
201
+ cusolverStatus_t CUSOLVERAPI
202
+ cusolverRfResetValues(/* Input (in the device memory) */
203
+ int n,
204
+ int nnzA,
205
+ int* csrRowPtrA,
206
+ int* csrColIndA,
207
+ double* csrValA,
208
+ int* P,
209
+ int* Q,
210
+ /* Output */
211
+ cusolverRfHandle_t handle);
212
+
213
+ /* CUSOLVERRF analysis (for parallelism) */
214
+ cusolverStatus_t CUSOLVERAPI cusolverRfAnalyze(cusolverRfHandle_t handle);
215
+
216
+ /* CUSOLVERRF re-factorization (for parallelism) */
217
+ cusolverStatus_t CUSOLVERAPI cusolverRfRefactor(cusolverRfHandle_t handle);
218
+
219
+ /* CUSOLVERRF extraction: Get L & U packed into a single matrix M */
220
+ cusolverStatus_t CUSOLVERAPI
221
+ cusolverRfAccessBundledFactorsDevice(/* Input */
222
+ cusolverRfHandle_t handle,
223
+ /* Output (in the host memory) */
224
+ int* nnzM,
225
+ /* Output (in the device memory) */
226
+ int** Mp,
227
+ int** Mi,
228
+ double** Mx);
229
+
230
+ cusolverStatus_t CUSOLVERAPI
231
+ cusolverRfExtractBundledFactorsHost(/* Input */
232
+ cusolverRfHandle_t handle,
233
+ /* Output (in the host memory) */
234
+ int* h_nnzM,
235
+ int** h_Mp,
236
+ int** h_Mi,
237
+ double** h_Mx);
238
+
239
+ /* CUSOLVERRF extraction: Get L & U individually */
240
+ cusolverStatus_t CUSOLVERAPI
241
+ cusolverRfExtractSplitFactorsHost(/* Input */
242
+ cusolverRfHandle_t handle,
243
+ /* Output (in the host memory) */
244
+ int* h_nnzL,
245
+ int** h_csrRowPtrL,
246
+ int** h_csrColIndL,
247
+ double** h_csrValL,
248
+ int* h_nnzU,
249
+ int** h_csrRowPtrU,
250
+ int** h_csrColIndU,
251
+ double** h_csrValU);
252
+
253
+ /* CUSOLVERRF (forward and backward triangular) solves */
254
+ cusolverStatus_t CUSOLVERAPI
255
+ cusolverRfSolve(/* Input (in the device memory) */
256
+ cusolverRfHandle_t handle,
257
+ int* P,
258
+ int* Q,
259
+ int nrhs, // only nrhs=1 is supported
260
+ double* Temp, // of size ldt*nrhs (ldt>=n)
261
+ int ldt,
262
+ /* Input/Output (in the device memory) */
263
+ double* XF,
264
+ /* Input */
265
+ int ldxf);
266
+
267
+ /*** Batched Routines ***/
268
+ /* CUSOLVERRF-batch setup of internal structures from host */
269
+ cusolverStatus_t CUSOLVERAPI
270
+ cusolverRfBatchSetupHost(/* Input (in the host memory)*/
271
+ int batchSize,
272
+ int n,
273
+ int nnzA,
274
+ int* h_csrRowPtrA,
275
+ int* h_csrColIndA,
276
+ double* h_csrValA_array[],
277
+ int nnzL,
278
+ int* h_csrRowPtrL,
279
+ int* h_csrColIndL,
280
+ double* h_csrValL,
281
+ int nnzU,
282
+ int* h_csrRowPtrU,
283
+ int* h_csrColIndU,
284
+ double* h_csrValU,
285
+ int* h_P,
286
+ int* h_Q,
287
+ /* Output (in the device memory) */
288
+ cusolverRfHandle_t handle);
289
+
290
+ /* CUSOLVERRF-batch update the matrix values (assuming the reordering,
291
+ pivoting and consequently the sparsity pattern of L and U did not change),
292
+ and zero out the remaining values. */
293
+ cusolverStatus_t CUSOLVERAPI
294
+ cusolverRfBatchResetValues(/* Input (in the device memory) */
295
+ int batchSize,
296
+ int n,
297
+ int nnzA,
298
+ int* csrRowPtrA,
299
+ int* csrColIndA,
300
+ double* csrValA_array[],
301
+ int* P,
302
+ int* Q,
303
+ /* Output */
304
+ cusolverRfHandle_t handle);
305
+
306
+ /* CUSOLVERRF-batch analysis (for parallelism) */
307
+ cusolverStatus_t CUSOLVERAPI
308
+ cusolverRfBatchAnalyze(cusolverRfHandle_t handle);
309
+
310
+ /* CUSOLVERRF-batch re-factorization (for parallelism) */
311
+ cusolverStatus_t CUSOLVERAPI
312
+ cusolverRfBatchRefactor(cusolverRfHandle_t handle);
313
+
314
+ /* CUSOLVERRF-batch (forward and backward triangular) solves */
315
+ cusolverStatus_t CUSOLVERAPI
316
+ cusolverRfBatchSolve(/* Input (in the device memory) */
317
+ cusolverRfHandle_t handle,
318
+ int* P,
319
+ int* Q,
320
+ int nrhs, // only nrhs=1 is supported
321
+ double* Temp, // of size 2*batchSize*(n*nrhs)
322
+ int ldt, // only ldt=n is supported
323
+ /* Input/Output (in the device memory) */
324
+ double* XF_array[],
325
+ /* Input */
326
+ int ldxf);
327
+
328
+ /* CUSOLVERRF-batch obtain the position of zero pivot */
329
+ cusolverStatus_t CUSOLVERAPI
330
+ cusolverRfBatchZeroPivot(/* Input */
331
+ cusolverRfHandle_t handle,
332
+ /* Output (in the host memory) */
333
+ int* position);
334
+
335
+ #if defined(__cplusplus)
336
+ }
337
+ #endif /* __cplusplus */
338
+
339
+ #endif /* CUSOLVERRF_H_ */