diff --git "a/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h" "b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h" new file mode 100644--- /dev/null +++ "b/tuning-competition-baseline/.venv/lib/python3.11/site-packages/nvidia/cublas/include/cublas_api.h" @@ -0,0 +1,3478 @@ +/* + * Copyright 1993-2022 NVIDIA Corporation. All rights reserved. + * + * NOTICE TO LICENSEE: + * + * This source code and/or documentation ("Licensed Deliverables") are + * subject to NVIDIA intellectual property rights under U.S. and + * international Copyright laws. + * + * These Licensed Deliverables contained herein is PROPRIETARY and + * CONFIDENTIAL to NVIDIA and is being provided under the terms and + * conditions of a form of NVIDIA software license agreement by and + * between NVIDIA and Licensee ("License Agreement") or electronically + * accepted by Licensee. Notwithstanding any terms or conditions to + * the contrary in the License Agreement, reproduction or disclosure + * of the Licensed Deliverables to any third party without the express + * written consent of NVIDIA is prohibited. + * + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE + * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS + * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND. + * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED + * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY, + * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE. + * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE + * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY + * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY + * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, + * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THESE LICENSED DELIVERABLES. + * + * U.S. Government End Users. These Licensed Deliverables are a + * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT + * 1995), consisting of "commercial computer software" and "commercial + * computer software documentation" as such terms are used in 48 + * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government + * only as a commercial end item. Consistent with 48 C.F.R.12.212 and + * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all + * U.S. Government End Users acquire the Licensed Deliverables with + * only those rights set forth herein. + * + * Any use of the Licensed Deliverables in individual and commercial + * software must include, in the user documentation and internal + * comments to the code, the above Disclaimer and U.S. Government End + * Users Notice. + */ + +/* + * This is the public header file for the CUBLAS library, defining the API + * + * CUBLAS is an implementation of BLAS (Basic Linear Algebra Subroutines) + * on top of the CUDA runtime. + */ + +#if !defined(CUBLAS_API_H_) +#define CUBLAS_API_H_ + +#ifndef CUBLASWINAPI +#ifdef _WIN32 +#define CUBLASWINAPI __stdcall +#else +#define CUBLASWINAPI +#endif +#endif + +#ifndef CUBLASAPI +#error "This file should not be included without defining CUBLASAPI" +#endif + +#include "driver_types.h" +#include "cuComplex.h" /* import complex data type */ + +#include +#include + +#include + +#if defined(__cplusplus) +extern "C" { +#endif /* __cplusplus */ + +#define CUBLAS_VER_MAJOR 11 +#define CUBLAS_VER_MINOR 11 +#define CUBLAS_VER_PATCH 3 +#define CUBLAS_VER_BUILD 6 +#define CUBLAS_VERSION (CUBLAS_VER_MAJOR * 10000 + CUBLAS_VER_MINOR * 100 + CUBLAS_VER_PATCH) + +/* CUBLAS status type returns */ +typedef enum { + CUBLAS_STATUS_SUCCESS = 0, + CUBLAS_STATUS_NOT_INITIALIZED = 1, + CUBLAS_STATUS_ALLOC_FAILED = 3, + CUBLAS_STATUS_INVALID_VALUE = 7, + CUBLAS_STATUS_ARCH_MISMATCH = 8, + CUBLAS_STATUS_MAPPING_ERROR = 11, + CUBLAS_STATUS_EXECUTION_FAILED = 13, + CUBLAS_STATUS_INTERNAL_ERROR = 14, + CUBLAS_STATUS_NOT_SUPPORTED = 15, + CUBLAS_STATUS_LICENSE_ERROR = 16 +} cublasStatus_t; + +typedef enum { CUBLAS_FILL_MODE_LOWER = 0, CUBLAS_FILL_MODE_UPPER = 1, CUBLAS_FILL_MODE_FULL = 2 } cublasFillMode_t; + +typedef enum { CUBLAS_DIAG_NON_UNIT = 0, CUBLAS_DIAG_UNIT = 1 } cublasDiagType_t; + +typedef enum { CUBLAS_SIDE_LEFT = 0, CUBLAS_SIDE_RIGHT = 1 } cublasSideMode_t; + +typedef enum { + CUBLAS_OP_N = 0, + CUBLAS_OP_T = 1, + CUBLAS_OP_C = 2, + CUBLAS_OP_HERMITAN = 2, /* synonym if CUBLAS_OP_C */ + CUBLAS_OP_CONJG = 3 /* conjugate, placeholder - not supported in the current release */ +} cublasOperation_t; + +typedef enum { CUBLAS_POINTER_MODE_HOST = 0, CUBLAS_POINTER_MODE_DEVICE = 1 } cublasPointerMode_t; + +typedef enum { CUBLAS_ATOMICS_NOT_ALLOWED = 0, CUBLAS_ATOMICS_ALLOWED = 1 } cublasAtomicsMode_t; + +/*For different GEMM algorithm */ +typedef enum { + CUBLAS_GEMM_DFALT = -1, + CUBLAS_GEMM_DEFAULT = -1, + CUBLAS_GEMM_ALGO0 = 0, + CUBLAS_GEMM_ALGO1 = 1, + CUBLAS_GEMM_ALGO2 = 2, + CUBLAS_GEMM_ALGO3 = 3, + CUBLAS_GEMM_ALGO4 = 4, + CUBLAS_GEMM_ALGO5 = 5, + CUBLAS_GEMM_ALGO6 = 6, + CUBLAS_GEMM_ALGO7 = 7, + CUBLAS_GEMM_ALGO8 = 8, + CUBLAS_GEMM_ALGO9 = 9, + CUBLAS_GEMM_ALGO10 = 10, + CUBLAS_GEMM_ALGO11 = 11, + CUBLAS_GEMM_ALGO12 = 12, + CUBLAS_GEMM_ALGO13 = 13, + CUBLAS_GEMM_ALGO14 = 14, + CUBLAS_GEMM_ALGO15 = 15, + CUBLAS_GEMM_ALGO16 = 16, + CUBLAS_GEMM_ALGO17 = 17, + CUBLAS_GEMM_ALGO18 = 18, // sliced 32x32 + CUBLAS_GEMM_ALGO19 = 19, // sliced 64x32 + CUBLAS_GEMM_ALGO20 = 20, // sliced 128x32 + CUBLAS_GEMM_ALGO21 = 21, // sliced 32x32 -splitK + CUBLAS_GEMM_ALGO22 = 22, // sliced 64x32 -splitK + CUBLAS_GEMM_ALGO23 = 23, // sliced 128x32 -splitK + CUBLAS_GEMM_DEFAULT_TENSOR_OP = 99, + CUBLAS_GEMM_DFALT_TENSOR_OP = 99, + CUBLAS_GEMM_ALGO0_TENSOR_OP = 100, + CUBLAS_GEMM_ALGO1_TENSOR_OP = 101, + CUBLAS_GEMM_ALGO2_TENSOR_OP = 102, + CUBLAS_GEMM_ALGO3_TENSOR_OP = 103, + CUBLAS_GEMM_ALGO4_TENSOR_OP = 104, + CUBLAS_GEMM_ALGO5_TENSOR_OP = 105, + CUBLAS_GEMM_ALGO6_TENSOR_OP = 106, + CUBLAS_GEMM_ALGO7_TENSOR_OP = 107, + CUBLAS_GEMM_ALGO8_TENSOR_OP = 108, + CUBLAS_GEMM_ALGO9_TENSOR_OP = 109, + CUBLAS_GEMM_ALGO10_TENSOR_OP = 110, + CUBLAS_GEMM_ALGO11_TENSOR_OP = 111, + CUBLAS_GEMM_ALGO12_TENSOR_OP = 112, + CUBLAS_GEMM_ALGO13_TENSOR_OP = 113, + CUBLAS_GEMM_ALGO14_TENSOR_OP = 114, + CUBLAS_GEMM_ALGO15_TENSOR_OP = 115 +} cublasGemmAlgo_t; + +/*Enum for default math mode/tensor operation*/ +typedef enum { + CUBLAS_DEFAULT_MATH = 0, + + /* deprecated, same effect as using CUBLAS_COMPUTE_32F_FAST_16F, will be removed in a future release */ + CUBLAS_TENSOR_OP_MATH = 1, + + /* same as using matching _PEDANTIC compute type when using cublasroutine calls or cublasEx() calls with + cudaDataType as compute type */ + CUBLAS_PEDANTIC_MATH = 2, + + /* allow accelerating single precision routines using TF32 tensor cores */ + CUBLAS_TF32_TENSOR_OP_MATH = 3, + + /* flag to force any reductons to use the accumulator type and not output type in case of mixed precision routines + with lower size output type */ + CUBLAS_MATH_DISALLOW_REDUCED_PRECISION_REDUCTION = 16, +} cublasMath_t; + +/* For backward compatibility purposes */ +typedef cudaDataType cublasDataType_t; + +/* Enum for compute type + * + * - default types provide best available performance using all available hardware features + * and guarantee internal storage precision with at least the same precision and range; + * - _PEDANTIC types ensure standard arithmetic and exact specified internal storage format; + * - _FAST types allow for some loss of precision to enable higher throughput arithmetic. + */ +typedef enum { + CUBLAS_COMPUTE_16F = 64, /* half - default */ + CUBLAS_COMPUTE_16F_PEDANTIC = 65, /* half - pedantic */ + CUBLAS_COMPUTE_32F = 68, /* float - default */ + CUBLAS_COMPUTE_32F_PEDANTIC = 69, /* float - pedantic */ + CUBLAS_COMPUTE_32F_FAST_16F = 74, /* float - fast, allows down-converting inputs to half or TF32 */ + CUBLAS_COMPUTE_32F_FAST_16BF = 75, /* float - fast, allows down-converting inputs to bfloat16 or TF32 */ + CUBLAS_COMPUTE_32F_FAST_TF32 = 77, /* float - fast, allows down-converting inputs to TF32 */ + CUBLAS_COMPUTE_64F = 70, /* double - default */ + CUBLAS_COMPUTE_64F_PEDANTIC = 71, /* double - pedantic */ + CUBLAS_COMPUTE_32I = 72, /* signed 32-bit int - default */ + CUBLAS_COMPUTE_32I_PEDANTIC = 73, /* signed 32-bit int - pedantic */ +} cublasComputeType_t; + +/* Opaque structure holding CUBLAS library context */ +struct cublasContext; +typedef struct cublasContext* cublasHandle_t; + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCreate_v2(cublasHandle_t* handle); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDestroy_v2(cublasHandle_t handle); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetVersion_v2(cublasHandle_t handle, int* version); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetProperty(libraryPropertyType type, int* value); +CUBLASAPI size_t CUBLASWINAPI cublasGetCudartVersion(void); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetWorkspace_v2(cublasHandle_t handle, + void* workspace, + size_t workspaceSizeInBytes); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetStream_v2(cublasHandle_t handle, cudaStream_t streamId); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetStream_v2(cublasHandle_t handle, cudaStream_t* streamId); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t* mode); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetPointerMode_v2(cublasHandle_t handle, cublasPointerMode_t mode); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t* mode); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetAtomicsMode(cublasHandle_t handle, cublasAtomicsMode_t mode); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetMathMode(cublasHandle_t handle, cublasMath_t* mode); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetMathMode(cublasHandle_t handle, cublasMath_t mode); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetSmCountTarget(cublasHandle_t handle, int* smCountTarget); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetSmCountTarget(cublasHandle_t handle, int smCountTarget); + +CUBLASAPI const char* CUBLASWINAPI cublasGetStatusName(cublasStatus_t status); +CUBLASAPI const char* CUBLASWINAPI cublasGetStatusString(cublasStatus_t status); + +/* Cublas logging */ +typedef void (*cublasLogCallback)(const char* msg); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasLoggerConfigure(int logIsOn, + int logToStdOut, + int logToStdErr, + const char* logFileName); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSetLoggerCallback(cublasLogCallback userCallback); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGetLoggerCallback(cublasLogCallback* userCallback); + +/* + * cublasStatus_t + * cublasSetVector (int n, int elemSize, const void *x, int incx, + * void *y, int incy) + * + * copies n elements from a vector x in CPU memory space to a vector y + * in GPU memory space. Elements in both vectors are assumed to have a + * size of elemSize bytes. Storage spacing between consecutive elements + * is incx for the source vector x and incy for the destination vector + * y. In general, y points to an object, or part of an object, allocated + * via cublasAlloc(). Column major format for two-dimensional matrices + * is assumed throughout CUBLAS. Therefore, if the increment for a vector + * is equal to 1, this access a column vector while using an increment + * equal to the leading dimension of the respective matrix accesses a + * row vector. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasSetVector(int n, int elemSize, const void* x, int incx, void* devicePtr, int incy); + +/* + * cublasStatus_t + * cublasGetVector (int n, int elemSize, const void *x, int incx, + * void *y, int incy) + * + * copies n elements from a vector x in GPU memory space to a vector y + * in CPU memory space. Elements in both vectors are assumed to have a + * size of elemSize bytes. Storage spacing between consecutive elements + * is incx for the source vector x and incy for the destination vector + * y. In general, x points to an object, or part of an object, allocated + * via cublasAlloc(). Column major format for two-dimensional matrices + * is assumed throughout CUBLAS. Therefore, if the increment for a vector + * is equal to 1, this access a column vector while using an increment + * equal to the leading dimension of the respective matrix accesses a + * row vector. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasGetVector(int n, int elemSize, const void* x, int incx, void* y, int incy); + +/* + * cublasStatus_t + * cublasSetMatrix (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb) + * + * copies a tile of rows x cols elements from a matrix A in CPU memory + * space to a matrix B in GPU memory space. Each element requires storage + * of elemSize bytes. Both matrices are assumed to be stored in column + * major format, with the leading dimension (i.e. number of rows) of + * source matrix A provided in lda, and the leading dimension of matrix B + * provided in ldb. In general, B points to an object, or part of an + * object, that was allocated via cublasAlloc(). + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or + * ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasSetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb); + +/* + * cublasStatus_t + * cublasGetMatrix (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb) + * + * copies a tile of rows x cols elements from a matrix A in GPU memory + * space to a matrix B in CPU memory space. Each element requires storage + * of elemSize bytes. Both matrices are assumed to be stored in column + * major format, with the leading dimension (i.e. number of rows) of + * source matrix A provided in lda, and the leading dimension of matrix B + * provided in ldb. In general, A points to an object, or part of an + * object, that was allocated via cublasAlloc(). + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows, cols, eleSize, lda, or ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasGetMatrix(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb); + +/* + * cublasStatus + * cublasSetVectorAsync ( int n, int elemSize, const void *x, int incx, + * void *y, int incy, cudaStream_t stream ); + * + * cublasSetVectorAsync has the same functionnality as cublasSetVector + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasSetVectorAsync( + int n, int elemSize, const void* hostPtr, int incx, void* devicePtr, int incy, cudaStream_t stream); +/* + * cublasStatus + * cublasGetVectorAsync( int n, int elemSize, const void *x, int incx, + * void *y, int incy, cudaStream_t stream) + * + * cublasGetVectorAsync has the same functionnality as cublasGetVector + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library not been initialized + * CUBLAS_STATUS_INVALID_VALUE if incx, incy, or elemSize <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if an error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI cublasGetVectorAsync( + int n, int elemSize, const void* devicePtr, int incx, void* hostPtr, int incy, cudaStream_t stream); + +/* + * cublasStatus_t + * cublasSetMatrixAsync (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb, cudaStream_t stream) + * + * cublasSetMatrixAsync has the same functionnality as cublasSetMatrix + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows or cols < 0, or elemSize, lda, or + * ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI +cublasSetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream); + +/* + * cublasStatus_t + * cublasGetMatrixAsync (int rows, int cols, int elemSize, const void *A, + * int lda, void *B, int ldb, cudaStream_t stream) + * + * cublasGetMatrixAsync has the same functionnality as cublasGetMatrix + * but the transfer is done asynchronously within the CUDA stream passed + * in parameter. + * + * Return Values + * ------------- + * CUBLAS_STATUS_NOT_INITIALIZED if CUBLAS library has not been initialized + * CUBLAS_STATUS_INVALID_VALUE if rows, cols, eleSize, lda, or ldb <= 0 + * CUBLAS_STATUS_MAPPING_ERROR if error occurred accessing GPU memory + * CUBLAS_STATUS_SUCCESS if the operation completed successfully + */ +cublasStatus_t CUBLASWINAPI +cublasGetMatrixAsync(int rows, int cols, int elemSize, const void* A, int lda, void* B, int ldb, cudaStream_t stream); + +CUBLASAPI void CUBLASWINAPI cublasXerbla(const char* srName, int info); +/* ---------------- CUBLAS BLAS1 functions ---------------- */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasNrm2Ex(cublasHandle_t handle, + int n, + const void* x, + cudaDataType xType, + int incx, + void* result, + cudaDataType resultType, + cudaDataType executionType); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasSnrm2_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasDnrm2_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasScnrm2_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDznrm2_v2( + cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotEx(cublasHandle_t handle, + int n, + const void* x, + cudaDataType xType, + int incx, + const void* y, + cudaDataType yType, + int incy, + void* result, + cudaDataType resultType, + cudaDataType executionType); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDotcEx(cublasHandle_t handle, + int n, + const void* x, + cudaDataType xType, + int incx, + const void* y, + cudaDataType yType, + int incy, + void* result, + cudaDataType resultType, + cudaDataType executionType); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdot_v2(cublasHandle_t handle, + int n, + const float* x, + int incx, + const float* y, + int incy, + float* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdot_v2(cublasHandle_t handle, + int n, + const double* x, + int incx, + const double* y, + int incy, + double* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotu_v2(cublasHandle_t handle, + int n, + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdotc_v2(cublasHandle_t handle, + int n, + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotu_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdotc_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasScalEx(cublasHandle_t handle, + int n, + const void* alpha, /* host or device pointer */ + cudaDataType alphaType, + void* x, + cudaDataType xType, + int incx, + cudaDataType executionType); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSscal_v2(cublasHandle_t handle, + int n, + const float* alpha, /* host or device pointer */ + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDscal_v2(cublasHandle_t handle, + int n, + const double* alpha, /* host or device pointer */ + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCscal_v2(cublasHandle_t handle, + int n, + const cuComplex* alpha, /* host or device pointer */ + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsscal_v2(cublasHandle_t handle, + int n, + const float* alpha, /* host or device pointer */ + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZscal_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + cuDoubleComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdscal_v2(cublasHandle_t handle, + int n, + const double* alpha, /* host or device pointer */ + cuDoubleComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAxpyEx(cublasHandle_t handle, + int n, + const void* alpha, /* host or device pointer */ + cudaDataType alphaType, + const void* x, + cudaDataType xType, + int incx, + void* y, + cudaDataType yType, + int incy, + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSaxpy_v2(cublasHandle_t handle, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDaxpy_v2(cublasHandle_t handle, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCaxpy_v2(cublasHandle_t handle, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZaxpy_v2(cublasHandle_t handle, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + cuDoubleComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCopyEx( + cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, void* y, cudaDataType yType, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasScopy_v2(cublasHandle_t handle, int n, const float* x, int incx, float* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasDcopy_v2(cublasHandle_t handle, int n, const double* x, int incx, double* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasCcopy_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, cuComplex* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasZcopy_v2(cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasSswap_v2(cublasHandle_t handle, int n, float* x, int incx, float* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasDswap_v2(cublasHandle_t handle, int n, double* x, int incx, double* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasCswap_v2(cublasHandle_t handle, int n, cuComplex* x, int incx, cuComplex* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasZswap_v2(cublasHandle_t handle, int n, cuDoubleComplex* x, int incx, cuDoubleComplex* y, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSwapEx( + cublasHandle_t handle, int n, void* x, cudaDataType xType, int incx, void* y, cudaDataType yType, int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIsamax_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIdamax_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIcamax_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamax_v2( + cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIamaxEx( + cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result /* host or device pointer */ +); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIsamin_v2(cublasHandle_t handle, int n, const float* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIdamin_v2(cublasHandle_t handle, int n, const double* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasIcamin_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIzamin_v2( + cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, int* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasIaminEx( + cublasHandle_t handle, int n, const void* x, cudaDataType xType, int incx, int* result /* host or device pointer */ +); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasAsumEx(cublasHandle_t handle, + int n, + const void* x, + cudaDataType xType, + int incx, + void* result, + cudaDataType resultType, /* host or device pointer */ + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasSasum_v2(cublasHandle_t handle, int n, const float* x, int incx, float* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasDasum_v2(cublasHandle_t handle, int n, const double* x, int incx, double* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasScasum_v2(cublasHandle_t handle, int n, const cuComplex* x, int incx, float* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDzasum_v2( + cublasHandle_t handle, int n, const cuDoubleComplex* x, int incx, double* result); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrot_v2(cublasHandle_t handle, + int n, + float* x, + int incx, + float* y, + int incy, + const float* c, /* host or device pointer */ + const float* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrot_v2(cublasHandle_t handle, + int n, + double* x, + int incx, + double* y, + int incy, + const double* c, /* host or device pointer */ + const double* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrot_v2(cublasHandle_t handle, + int n, + cuComplex* x, + int incx, + cuComplex* y, + int incy, + const float* c, /* host or device pointer */ + const cuComplex* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsrot_v2(cublasHandle_t handle, + int n, + cuComplex* x, + int incx, + cuComplex* y, + int incy, + const float* c, /* host or device pointer */ + const float* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrot_v2(cublasHandle_t handle, + int n, + cuDoubleComplex* x, + int incx, + cuDoubleComplex* y, + int incy, + const double* c, /* host or device pointer */ + const cuDoubleComplex* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdrot_v2(cublasHandle_t handle, + int n, + cuDoubleComplex* x, + int incx, + cuDoubleComplex* y, + int incy, + const double* c, /* host or device pointer */ + const double* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotEx(cublasHandle_t handle, + int n, + void* x, + cudaDataType xType, + int incx, + void* y, + cudaDataType yType, + int incy, + const void* c, /* host or device pointer */ + const void* s, + cudaDataType csType, + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotg_v2(cublasHandle_t handle, + float* a, /* host or device pointer */ + float* b, /* host or device pointer */ + float* c, /* host or device pointer */ + float* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotg_v2(cublasHandle_t handle, + double* a, /* host or device pointer */ + double* b, /* host or device pointer */ + double* c, /* host or device pointer */ + double* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCrotg_v2(cublasHandle_t handle, + cuComplex* a, /* host or device pointer */ + cuComplex* b, /* host or device pointer */ + float* c, /* host or device pointer */ + cuComplex* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZrotg_v2(cublasHandle_t handle, + cuDoubleComplex* a, /* host or device pointer */ + cuDoubleComplex* b, /* host or device pointer */ + double* c, /* host or device pointer */ + cuDoubleComplex* s); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotgEx(cublasHandle_t handle, + void* a, /* host or device pointer */ + void* b, /* host or device pointer */ + cudaDataType abType, + void* c, /* host or device pointer */ + void* s, /* host or device pointer */ + cudaDataType csType, + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotm_v2(cublasHandle_t handle, + int n, + float* x, + int incx, + float* y, + int incy, + const float* param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotm_v2(cublasHandle_t handle, + int n, + double* x, + int incx, + double* y, + int incy, + const double* param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmEx(cublasHandle_t handle, + int n, + void* x, + cudaDataType xType, + int incx, + void* y, + cudaDataType yType, + int incy, + const void* param, /* host or device pointer */ + cudaDataType paramType, + cudaDataType executiontype); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSrotmg_v2(cublasHandle_t handle, + float* d1, /* host or device pointer */ + float* d2, /* host or device pointer */ + float* x1, /* host or device pointer */ + const float* y1, /* host or device pointer */ + float* param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDrotmg_v2(cublasHandle_t handle, + double* d1, /* host or device pointer */ + double* d2, /* host or device pointer */ + double* x1, /* host or device pointer */ + const double* y1, /* host or device pointer */ + double* param); /* host or device pointer */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasRotmgEx(cublasHandle_t handle, + void* d1, /* host or device pointer */ + cudaDataType d1Type, + void* d2, /* host or device pointer */ + cudaDataType d2Type, + void* x1, /* host or device pointer */ + cudaDataType x1Type, + const void* y1, /* host or device pointer */ + cudaDataType y1Type, + void* param, /* host or device pointer */ + cudaDataType paramType, + cudaDataType executiontype); +/* --------------- CUBLAS BLAS2 functions ---------------- */ + +/* GEMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* x, + int incx, + const float* beta, /* host or device pointer */ + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* x, + int incx, + const double* beta, /* host or device pointer */ + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); +/* GBMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgbmv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* x, + int incx, + const float* beta, /* host or device pointer */ + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgbmv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* x, + int incx, + const double* beta, /* host or device pointer */ + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgbmv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgbmv_v2(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int kl, + int ku, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); + +/* TRMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float* A, + int lda, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double* A, + int lda, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex* A, + int lda, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* x, + int incx); + +/* TBMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const float* A, + int lda, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const double* A, + int lda, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuComplex* A, + int lda, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* x, + int incx); + +/* TPMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float* AP, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double* AP, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex* AP, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex* AP, + cuDoubleComplex* x, + int incx); + +/* TRSV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float* A, + int lda, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double* A, + int lda, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex* A, + int lda, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* x, + int incx); + +/* TPSV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStpsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const float* AP, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtpsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const double* AP, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtpsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuComplex* AP, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + const cuDoubleComplex* AP, + cuDoubleComplex* x, + int incx); +/* TBSV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const float* A, + int lda, + float* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const double* A, + int lda, + double* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuComplex* A, + int lda, + cuComplex* x, + int incx); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtbsv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int n, + int k, + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* x, + int incx); + +/* SYMV/HEMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* x, + int incx, + const float* beta, /* host or device pointer */ + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* x, + int incx, + const double* beta, /* host or device pointer */ + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); + +/* SBMV/HBMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* x, + int incx, + const float* beta, /* host or device pointer */ + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* x, + int incx, + const double* beta, /* host or device pointer */ + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhbmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); + +/* SPMV/HPMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* AP, + const float* x, + int incx, + const float* beta, /* host or device pointer */ + float* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* AP, + const double* x, + int incx, + const double* beta, /* host or device pointer */ + double* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* AP, + const cuComplex* x, + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpmv_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* AP, + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy); + +/* GER */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSger_v2(cublasHandle_t handle, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + const float* y, + int incy, + float* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDger_v2(cublasHandle_t handle, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + const double* y, + int incy, + double* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeru_v2(cublasHandle_t handle, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgerc_v2(cublasHandle_t handle, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeru_v2(cublasHandle_t handle, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgerc_v2(cublasHandle_t handle, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); + +/* SYR/HER */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + float* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + double* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + cuDoubleComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + cuDoubleComplex* A, + int lda); + +/* SPR/HPR */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + float* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + double* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + cuComplex* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + cuDoubleComplex* AP); + +/* SYR2/HER2 */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + const float* y, + int incy, + float* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + const double* y, + int incy, + double* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* A, + int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* A, + int lda); + +/* SPR2/HPR2 */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSspr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const float* alpha, /* host or device pointer */ + const float* x, + int incx, + const float* y, + int incy, + float* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDspr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const double* alpha, /* host or device pointer */ + const double* x, + int incx, + const double* y, + int incy, + double* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChpr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* x, + int incx, + const cuComplex* y, + int incy, + cuComplex* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhpr2_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* x, + int incx, + const cuDoubleComplex* y, + int incy, + cuDoubleComplex* AP); +/* BATCH GEMV */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* const Aarray[], + int lda, + const float* const xarray[], + int incx, + const float* beta, /* host or device pointer */ + float* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* const Aarray[], + int lda, + const double* const xarray[], + int incx, + const double* beta, /* host or device pointer */ + double* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* const Aarray[], + int lda, + const cuComplex* const xarray[], + int incx, + const cuComplex* beta, /* host or device pointer */ + cuComplex* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* const Aarray[], + int lda, + const cuDoubleComplex* const xarray[], + int incx, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* const yarray[], + int incy, + int batchCount); + +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __half* const Aarray[], + int lda, + const __half* const xarray[], + int incx, + const float* beta, /* host or device pointer */ + __half* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __half* const Aarray[], + int lda, + const __half* const xarray[], + int incx, + const float* beta, /* host or device pointer */ + float* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __nv_bfloat16* const Aarray[], + int lda, + const __nv_bfloat16* const xarray[], + int incx, + const float* beta, /* host or device pointer */ + __nv_bfloat16* const yarray[], + int incy, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __nv_bfloat16* const Aarray[], + int lda, + const __nv_bfloat16* const xarray[], + int incx, + const float* beta, /* host or device pointer */ + float* const yarray[], + int incy, + int batchCount); +#endif + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + long long int strideA, /* purposely signed */ + const float* x, + int incx, + long long int stridex, + const float* beta, /* host or device pointer */ + float* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + long long int strideA, /* purposely signed */ + const double* x, + int incx, + long long int stridex, + const double* beta, /* host or device pointer */ + double* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + long long int strideA, /* purposely signed */ + const cuComplex* x, + int incx, + long long int stridex, + const cuComplex* beta, /* host or device pointer */ + cuComplex* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasZgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + long long int strideA, /* purposely signed */ + const cuDoubleComplex* x, + int incx, + long long int stridex, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* y, + int incy, + long long int stridey, + int batchCount); + +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSHgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __half* A, + int lda, + long long int strideA, /* purposely signed */ + const __half* x, + int incx, + long long int stridex, + const float* beta, /* host or device pointer */ + __half* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHSSgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __half* A, + int lda, + long long int strideA, /* purposely signed */ + const __half* x, + int incx, + long long int stridex, + const float* beta, /* host or device pointer */ + float* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSTgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __nv_bfloat16* A, + int lda, + long long int strideA, /* purposely signed */ + const __nv_bfloat16* x, + int incx, + long long int stridex, + const float* beta, /* host or device pointer */ + __nv_bfloat16* y, + int incy, + long long int stridey, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasTSSgemvStridedBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + const float* alpha, /* host or device pointer */ + const __nv_bfloat16* A, + int lda, + long long int strideA, /* purposely signed */ + const __nv_bfloat16* x, + int incx, + long long int stridex, + const float* beta, /* host or device pointer */ + float* y, + int incy, + long long int stridey, + int batchCount); +#endif +/* ---------------- CUBLAS BLAS3 functions ---------------- */ + +/* GEMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, /* host or device pointer */ + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, /* host or device pointer */ + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3m(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, + const void* A, + cudaDataType Atype, + int lda, + const void* B, + cudaDataType Btype, + int ldb, + const cuComplex* beta, + void* C, + cudaDataType Ctype, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm_v2(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemm3m(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); + +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemm(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const __half* alpha, /* host or device pointer */ + const __half* A, + int lda, + const __half* B, + int ldb, + const __half* beta, /* host or device pointer */ + __half* C, + int ldc); +#endif +/* IO in FP16/FP32, computation in float */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + const void* B, + cudaDataType Btype, + int ldb, + const float* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + const void* B, + cudaDataType Btype, + int ldb, + const void* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo); + +/* IO in Int8 complex/cuComplex, computation in cuComplex */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, + const void* A, + cudaDataType Atype, + int lda, + const void* B, + cudaDataType Btype, + int ldb, + const cuComplex* beta, + void* C, + cudaDataType Ctype, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasUint8gemmBias(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + cublasOperation_t transc, + int m, + int n, + int k, + const unsigned char* A, + int A_bias, + int lda, + const unsigned char* B, + int B_bias, + int ldb, + unsigned char* C, + int C_bias, + int ldc, + int C_mult, + int C_shift); + +/* SYRK */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* beta, /* host or device pointer */ + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* beta, /* host or device pointer */ + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); +/* IO in Int8 complex/cuComplex, computation in cuComplex */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkEx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + const cuComplex* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc); + +/* IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrk3mEx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, + const void* A, + cudaDataType Atype, + int lda, + const cuComplex* beta, + void* C, + cudaDataType Ctype, + int ldc); + +/* HERK */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const float* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherk_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const double* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); + +/* IO in Int8 complex/cuComplex, computation in cuComplex */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkEx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + const float* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc); + +/* IO in Int8 complex/cuComplex, computation in cuComplex, Gaussian math */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherk3mEx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, + const void* A, + cudaDataType Atype, + int lda, + const float* beta, + void* C, + cudaDataType Ctype, + int ldc); + +/* SYR2K */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyr2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, /* host or device pointer */ + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyr2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, /* host or device pointer */ + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyr2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyr2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); +/* HER2K */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCher2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const float* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZher2k_v2(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const double* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); +/* SYRKX : eXtended SYRK*/ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsyrkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, /* host or device pointer */ + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsyrkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, /* host or device pointer */ + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsyrkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsyrkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); +/* HERKX : eXtended HERK */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCherkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const float* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZherkx(cublasHandle_t handle, + cublasFillMode_t uplo, + cublasOperation_t trans, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const double* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); +/* SYMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSsymm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* B, + int ldb, + const float* beta, /* host or device pointer */ + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDsymm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* B, + int ldb, + const double* beta, /* host or device pointer */ + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCsymm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZsymm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); + +/* HEMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasChemm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZhemm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* C, + int ldc); + +/* TRSM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + float* B, + int ldb); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + double* B, + int ldb); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + cuComplex* B, + int ldb); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + cuDoubleComplex* B, + int ldb); + +/* TRMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrmm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* B, + int ldb, + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrmm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* B, + int ldb, + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrmm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* B, + int ldb, + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrmm_v2(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* B, + int ldb, + cuDoubleComplex* C, + int ldc); +/* BATCH GEMM */ +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const __half* alpha, /* host or device pointer */ + const __half* const Aarray[], + int lda, + const __half* const Barray[], + int ldb, + const __half* beta, /* host or device pointer */ + __half* const Carray[], + int ldc, + int batchCount); +#endif +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* const Aarray[], + int lda, + const float* const Barray[], + int ldb, + const float* beta, /* host or device pointer */ + float* const Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* const Aarray[], + int lda, + const double* const Barray[], + int ldb, + const double* beta, /* host or device pointer */ + double* const Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* const Aarray[], + int lda, + const cuComplex* const Barray[], + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* const Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* const Aarray[], + int lda, + const cuComplex* const Barray[], + int ldb, + const cuComplex* beta, /* host or device pointer */ + cuComplex* const Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgemmBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* const Aarray[], + int lda, + const cuDoubleComplex* const Barray[], + int ldb, + const cuDoubleComplex* beta, /* host or device pointer */ + cuDoubleComplex* const Carray[], + int ldc, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmBatchedEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* const Aarray[], + cudaDataType Atype, + int lda, + const void* const Barray[], + cudaDataType Btype, + int ldb, + const void* beta, /* host or device pointer */ + void* const Carray[], + cudaDataType Ctype, + int ldc, + int batchCount, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasGemmStridedBatchedEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + long long int strideA, /* purposely signed */ + const void* B, + cudaDataType Btype, + int ldb, + long long int strideB, + const void* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc, + long long int strideC, + int batchCount, + cublasComputeType_t computeType, + cublasGemmAlgo_t algo); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + long long int strideA, /* purposely signed */ + const float* B, + int ldb, + long long int strideB, + const float* beta, /* host or device pointer */ + float* C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + long long int strideA, /* purposely signed */ + const double* B, + int ldb, + long long int strideB, + const double* beta, /* host or device pointer */ + double* C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + long long int strideA, /* purposely signed */ + const cuComplex* B, + int ldb, + long long int strideB, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgemm3mStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + long long int strideA, /* purposely signed */ + const cuComplex* B, + int ldb, + long long int strideB, + const cuComplex* beta, /* host or device pointer */ + cuComplex* C, + int ldc, + long long int strideC, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasZgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + long long int strideA, /* purposely signed */ + const cuDoubleComplex* B, + int ldb, + long long int strideB, + const cuDoubleComplex* beta, /* host or device poi */ + cuDoubleComplex* C, + int ldc, + long long int strideC, + int batchCount); + +#if defined(__cplusplus) +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasHgemmStridedBatched(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const __half* alpha, /* host or device pointer */ + const __half* A, + int lda, + long long int strideA, /* purposely signed */ + const __half* B, + int ldb, + long long int strideB, + const __half* beta, /* host or device pointer */ + __half* C, + int ldc, + long long int strideC, + int batchCount); +#endif +/* ---------------- CUBLAS BLAS-like extension ---------------- */ +/* GEAM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const float* alpha, /* host or device pointer */ + const float* A, + int lda, + const float* beta, /* host or device pointer */ + const float* B, + int ldb, + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const double* alpha, /* host or device pointer */ + const double* A, + int lda, + const double* beta, /* host or device pointer */ + const double* B, + int ldb, + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const cuComplex* alpha, /* host or device pointer */ + const cuComplex* A, + int lda, + const cuComplex* beta, /* host or device pointer */ + const cuComplex* B, + int ldb, + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeam(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + const cuDoubleComplex* alpha, /* host or device pointer */ + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* beta, /* host or device pointer */ + const cuDoubleComplex* B, + int ldb, + cuDoubleComplex* C, + int ldc); + +/* Batched LU - GETRF*/ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrfBatched(cublasHandle_t handle, + int n, + float* const A[], /*Device pointer*/ + int lda, + int* P, /*Device Pointer*/ + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrfBatched(cublasHandle_t handle, + int n, + double* const A[], /*Device pointer*/ + int lda, + int* P, /*Device Pointer*/ + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrfBatched(cublasHandle_t handle, + int n, + cuComplex* const A[], /*Device pointer*/ + int lda, + int* P, /*Device Pointer*/ + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrfBatched(cublasHandle_t handle, + int n, + cuDoubleComplex* const A[], /*Device pointer*/ + int lda, + int* P, /*Device Pointer*/ + int* info, /*Device Pointer*/ + int batchSize); + +/* Batched inversion based on LU factorization from getrf */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetriBatched(cublasHandle_t handle, + int n, + const float* const A[], /*Device pointer*/ + int lda, + const int* P, /*Device pointer*/ + float* const C[], /*Device pointer*/ + int ldc, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetriBatched(cublasHandle_t handle, + int n, + const double* const A[], /*Device pointer*/ + int lda, + const int* P, /*Device pointer*/ + double* const C[], /*Device pointer*/ + int ldc, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetriBatched(cublasHandle_t handle, + int n, + const cuComplex* const A[], /*Device pointer*/ + int lda, + const int* P, /*Device pointer*/ + cuComplex* const C[], /*Device pointer*/ + int ldc, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetriBatched(cublasHandle_t handle, + int n, + const cuDoubleComplex* const A[], /*Device pointer*/ + int lda, + const int* P, /*Device pointer*/ + cuDoubleComplex* const C[], /*Device pointer*/ + int ldc, + int* info, + int batchSize); + +/* Batched solver based on LU factorization from getrf */ + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const float* const Aarray[], + int lda, + const int* devIpiv, + float* const Barray[], + int ldb, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const double* const Aarray[], + int lda, + const int* devIpiv, + double* const Barray[], + int ldb, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const cuComplex* const Aarray[], + int lda, + const int* devIpiv, + cuComplex* const Barray[], + int ldb, + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgetrsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int n, + int nrhs, + const cuDoubleComplex* const Aarray[], + int lda, + const int* devIpiv, + cuDoubleComplex* const Barray[], + int ldb, + int* info, + int batchSize); + +/* TRSM - Batched Triangular Solver */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasStrsmBatched(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const float* alpha, /*Host or Device Pointer*/ + const float* const A[], + int lda, + float* const B[], + int ldb, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDtrsmBatched(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const double* alpha, /*Host or Device Pointer*/ + const double* const A[], + int lda, + double* const B[], + int ldb, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCtrsmBatched(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuComplex* alpha, /*Host or Device Pointer*/ + const cuComplex* const A[], + int lda, + cuComplex* const B[], + int ldb, + int batchCount); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrsmBatched(cublasHandle_t handle, + cublasSideMode_t side, + cublasFillMode_t uplo, + cublasOperation_t trans, + cublasDiagType_t diag, + int m, + int n, + const cuDoubleComplex* alpha, /*Host or Device Pointer*/ + const cuDoubleComplex* const A[], + int lda, + cuDoubleComplex* const B[], + int ldb, + int batchCount); + +/* Batched - MATINV*/ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSmatinvBatched(cublasHandle_t handle, + int n, + const float* const A[], /*Device pointer*/ + int lda, + float* const Ainv[], /*Device pointer*/ + int lda_inv, + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDmatinvBatched(cublasHandle_t handle, + int n, + const double* const A[], /*Device pointer*/ + int lda, + double* const Ainv[], /*Device pointer*/ + int lda_inv, + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCmatinvBatched(cublasHandle_t handle, + int n, + const cuComplex* const A[], /*Device pointer*/ + int lda, + cuComplex* const Ainv[], /*Device pointer*/ + int lda_inv, + int* info, /*Device Pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZmatinvBatched(cublasHandle_t handle, + int n, + const cuDoubleComplex* const A[], /*Device pointer*/ + int lda, + cuDoubleComplex* const Ainv[], /*Device pointer*/ + int lda_inv, + int* info, /*Device Pointer*/ + int batchSize); + +/* Batch QR Factorization */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgeqrfBatched(cublasHandle_t handle, + int m, + int n, + float* const Aarray[], /*Device pointer*/ + int lda, + float* const TauArray[], /*Device pointer*/ + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgeqrfBatched(cublasHandle_t handle, + int m, + int n, + double* const Aarray[], /*Device pointer*/ + int lda, + double* const TauArray[], /*Device pointer*/ + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgeqrfBatched(cublasHandle_t handle, + int m, + int n, + cuComplex* const Aarray[], /*Device pointer*/ + int lda, + cuComplex* const TauArray[], /*Device pointer*/ + int* info, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgeqrfBatched(cublasHandle_t handle, + int m, + int n, + cuDoubleComplex* const Aarray[], /*Device pointer*/ + int lda, + cuDoubleComplex* const TauArray[], /*Device pointer*/ + int* info, + int batchSize); +/* Least Square Min only m >= n and Non-transpose supported */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + float* const Aarray[], /*Device pointer*/ + int lda, + float* const Carray[], /*Device pointer*/ + int ldc, + int* info, + int* devInfoArray, /*Device pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + double* const Aarray[], /*Device pointer*/ + int lda, + double* const Carray[], /*Device pointer*/ + int ldc, + int* info, + int* devInfoArray, /*Device pointer*/ + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + cuComplex* const Aarray[], /*Device pointer*/ + int lda, + cuComplex* const Carray[], /*Device pointer*/ + int ldc, + int* info, + int* devInfoArray, + int batchSize); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZgelsBatched(cublasHandle_t handle, + cublasOperation_t trans, + int m, + int n, + int nrhs, + cuDoubleComplex* const Aarray[], /*Device pointer*/ + int lda, + cuDoubleComplex* const Carray[], /*Device pointer*/ + int ldc, + int* info, + int* devInfoArray, + int batchSize); +/* DGMM */ +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasSdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const float* A, + int lda, + const float* x, + int incx, + float* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasDdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const double* A, + int lda, + const double* x, + int incx, + double* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasCdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const cuComplex* A, + int lda, + const cuComplex* x, + int incx, + cuComplex* C, + int ldc); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZdgmm(cublasHandle_t handle, + cublasSideMode_t mode, + int m, + int n, + const cuDoubleComplex* A, + int lda, + const cuDoubleComplex* x, + int incx, + cuDoubleComplex* C, + int ldc); + +/* TPTTR : Triangular Pack format to Triangular format */ +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasStpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* AP, float* A, int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasDtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* AP, double* A, int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasCtpttr(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* AP, cuComplex* A, int lda); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtpttr( + cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* AP, cuDoubleComplex* A, int lda); +/* TRTTP : Triangular format to Triangular Pack format */ +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasStrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const float* A, int lda, float* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasDtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const double* A, int lda, double* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI +cublasCtrttp(cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuComplex* A, int lda, cuComplex* AP); + +CUBLASAPI cublasStatus_t CUBLASWINAPI cublasZtrttp( + cublasHandle_t handle, cublasFillMode_t uplo, int n, const cuDoubleComplex* A, int lda, cuDoubleComplex* AP); + +#if defined(__cplusplus) +} + +static inline cublasStatus_t cublasMigrateComputeType(cublasHandle_t handle, + cudaDataType_t dataType, + cublasComputeType_t* computeType) { + cublasMath_t mathMode = CUBLAS_DEFAULT_MATH; + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + + status = cublasGetMathMode(handle, &mathMode); + if (status != CUBLAS_STATUS_SUCCESS) { + return status; + } + + bool isPedantic = ((mathMode & 0xf) == CUBLAS_PEDANTIC_MATH); + + switch (dataType) { + case CUDA_R_32F: + case CUDA_C_32F: + *computeType = isPedantic ? CUBLAS_COMPUTE_32F_PEDANTIC : CUBLAS_COMPUTE_32F; + return CUBLAS_STATUS_SUCCESS; + case CUDA_R_64F: + case CUDA_C_64F: + *computeType = isPedantic ? CUBLAS_COMPUTE_64F_PEDANTIC : CUBLAS_COMPUTE_64F; + return CUBLAS_STATUS_SUCCESS; + case CUDA_R_16F: + *computeType = isPedantic ? CUBLAS_COMPUTE_16F_PEDANTIC : CUBLAS_COMPUTE_16F; + return CUBLAS_STATUS_SUCCESS; + case CUDA_R_32I: + *computeType = isPedantic ? CUBLAS_COMPUTE_32I_PEDANTIC : CUBLAS_COMPUTE_32I; + return CUBLAS_STATUS_SUCCESS; + default: + return CUBLAS_STATUS_NOT_SUPPORTED; + } +} +/* wrappers to accept old code with cudaDataType computeType when referenced from c++ code */ +static inline cublasStatus_t cublasGemmEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + const void* B, + cudaDataType Btype, + int ldb, + const void* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc, + cudaDataType computeType, + cublasGemmAlgo_t algo) { + cublasComputeType_t migratedComputeType = CUBLAS_COMPUTE_32F; + cublasStatus_t status = CUBLAS_STATUS_SUCCESS; + status = cublasMigrateComputeType(handle, computeType, &migratedComputeType); + if (status != CUBLAS_STATUS_SUCCESS) { + return status; + } + + return cublasGemmEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + migratedComputeType, + algo); +} + +static inline cublasStatus_t cublasGemmBatchedEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* const Aarray[], + cudaDataType Atype, + int lda, + const void* const Barray[], + cudaDataType Btype, + int ldb, + const void* beta, /* host or device pointer */ + void* const Carray[], + cudaDataType Ctype, + int ldc, + int batchCount, + cudaDataType computeType, + cublasGemmAlgo_t algo) { + cublasComputeType_t migratedComputeType; + cublasStatus_t status; + status = cublasMigrateComputeType(handle, computeType, &migratedComputeType); + if (status != CUBLAS_STATUS_SUCCESS) { + return status; + } + + return cublasGemmBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + Aarray, + Atype, + lda, + Barray, + Btype, + ldb, + beta, + Carray, + Ctype, + ldc, + batchCount, + migratedComputeType, + algo); +} + +static inline cublasStatus_t cublasGemmStridedBatchedEx(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, + int m, + int n, + int k, + const void* alpha, /* host or device pointer */ + const void* A, + cudaDataType Atype, + int lda, + long long int strideA, /* purposely signed */ + const void* B, + cudaDataType Btype, + int ldb, + long long int strideB, + const void* beta, /* host or device pointer */ + void* C, + cudaDataType Ctype, + int ldc, + long long int strideC, + int batchCount, + cudaDataType computeType, + cublasGemmAlgo_t algo) { + cublasComputeType_t migratedComputeType; + cublasStatus_t status; + status = cublasMigrateComputeType(handle, computeType, &migratedComputeType); + if (status != CUBLAS_STATUS_SUCCESS) { + return status; + } + + return cublasGemmStridedBatchedEx(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + strideA, + B, + Btype, + ldb, + strideB, + beta, + C, + Ctype, + ldc, + strideC, + batchCount, + migratedComputeType, + algo); +} +#endif /* __cplusplus */ + +#endif /* !defined(CUBLAS_API_H_) */